You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/07/13 21:32:13 UTC
[2/2] systemml git commit: [SYSTEMML-1758] added cbind and rbind for
GPU
[SYSTEMML-1758] added cbind and rbind for GPU
Closes #570
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4e47b5e1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4e47b5e1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4e47b5e1
Branch: refs/heads/master
Commit: 4e47b5e10ff1abdf1ef53c2b1b0d80614ec8e416
Parents: cd1ae5b
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu Jul 13 14:31:47 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu Jul 13 14:31:47 2017 -0700
----------------------------------------------------------------------
src/main/cpp/kernels/SystemML.cu | 78 +-
src/main/cpp/kernels/SystemML.ptx | 1043 ++++++++++--------
.../java/org/apache/sysml/hops/BinaryOp.java | 21 +-
src/main/java/org/apache/sysml/lops/Append.java | 95 ++
.../java/org/apache/sysml/lops/AppendCP.java | 93 --
.../instructions/CPInstructionParser.java | 4 +-
.../instructions/GPUInstructionParser.java | 17 +-
.../gpu/BuiltinUnaryGPUInstruction.java | 2 +-
.../instructions/gpu/GPUInstruction.java | 3 +
.../gpu/MatrixAppendGPUInstruction.java | 102 ++
.../runtime/matrix/data/LibMatrixCUDA.java | 109 +-
.../org/apache/sysml/test/gpu/AppendTest.java | 108 ++
.../test/integration/gpu/ZPackageSuite.java | 2 +
13 files changed, 1099 insertions(+), 578 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 3098282..297269f 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -216,7 +216,7 @@ __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
bIndex = iy; // rlen == 1
C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
//printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex, B[bIndex], (ix+1), (iy+1));
- __syncthreads();
+ __syncthreads();
}
}
@@ -238,9 +238,9 @@ __global__ void matrix_scalar_op(double* A, double scalar, double* C, int size,
C[index] = binaryOp(scalar, A[index], op);
} else {
C[index] = binaryOp(A[index], scalar, op);
- }
+ }
}
- __syncthreads();
+ __syncthreads();
}
@@ -259,6 +259,78 @@ __global__ void fill(double* A, double scalar, int lenA) {
}
/**
+ * Appends Matrix B to the right side of Matrix A into a new matrix C
+ * | 1 2 3 4 | | 8 8 8 | | 1 2 3 4 8 8 8 |
+ * cbind ( | 9 8 7 6 | , | 7 7 7 | ) = | 9 8 7 6 7 7 7 |
+ * | 4 3 2 1 | | 9 9 9 | | 4 3 2 1 9 9 9 |
+ * @param A input matrix A allocated on the GPU
+ * @param B input matrix B allocated on the GPU
+ * @param C output matrix C allocated on the GPU
+ * @param rowsA rows in A
+ * @param colsA columns in A
+ * @param rowsB rows in B
+ * @param colsB columns in B
+ */
+extern "C"
+__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
+ int ix = blockIdx.x * blockDim.x + threadIdx.x;
+ int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int colsC = colsA + colsB;
+ int rowsC = rowsA;
+
+ // Copy an element of A into C into the appropriate location
+ if (ix < rowsA && iy < colsA) {
+ double elemA = A[ix * colsA + iy];
+ C[ix * colsC + iy] = elemA;
+ }
+
+ // Copy an element of B into C into the appropriate location
+ if (ix < rowsB && iy < colsB) {
+ double elemB = B[ix * colsB + iy];
+ C[ix * colsC + (iy + colsA)] = elemB;
+ }
+}
+
+
+/**
+ * Appends Matrix B to the bottom of Matrix A into a new matrix C
+ *         | 2 3 4 |   | 8 8 8 |     | 2 3 4 |
+ * rbind ( | 8 7 6 | , | 7 7 7 | ) = | 8 7 6 |
+ *         | 3 2 1 |                 | 3 2 1 |
+ *                                   | 8 8 8 |
+ *                                   | 7 7 7 |
+ * @param A input matrix A allocated on the GPU
+ * @param B input matrix B allocated on the GPU
+ * @param C output matrix C allocated on the GPU
+ * @param rowsA rows in A
+ * @param colsA columns in A
+ * @param rowsB rows in B
+ * @param colsB columns in B
+ */
+extern "C"
+__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
+ int ix = blockIdx.x * blockDim.x + threadIdx.x;
+ int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int rowsC = rowsA + rowsB;
+ int colsC = colsA;
+
+ // Copy an element of A into C into the appropriate location
+ if (ix < rowsA && iy < colsA) {
+ double elemA = A[ix * colsA + iy];
+ C[ix * colsC + iy] = elemA;
+ }
+
+ // Copy an element of B into C into the appropriate location
+ if (ix < rowsB && iy < colsB) {
+ double elemB = B[ix * colsB + iy];
+ C[(ix + rowsA) * colsC + iy] = elemB;
+ }
+}
+
+
+/**
* Does a reduce operation over all elements of the array.
* This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
* and the Reduction example available through jcuda.org
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index ab43758..6884d5b 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1813,6 +1813,151 @@ BB9_2:
ret;
}
+ // .globl cbind
+.visible .entry cbind(
+ .param .u64 cbind_param_0,
+ .param .u64 cbind_param_1,
+ .param .u64 cbind_param_2,
+ .param .u32 cbind_param_3,
+ .param .u32 cbind_param_4,
+ .param .u32 cbind_param_5,
+ .param .u32 cbind_param_6
+)
+{
+ .reg .pred %p<7>;
+ .reg .b32 %r<19>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<15>;
+
+
+ ld.param.u64 %rd2, [cbind_param_0];
+ ld.param.u64 %rd3, [cbind_param_1];
+ ld.param.u64 %rd4, [cbind_param_2];
+ ld.param.u32 %r7, [cbind_param_3];
+ ld.param.u32 %r4, [cbind_param_4];
+ ld.param.u32 %r5, [cbind_param_5];
+ ld.param.u32 %r6, [cbind_param_6];
+ cvta.to.global.u64 %rd1, %rd4;
+ mov.u32 %r8, %ntid.x;
+ mov.u32 %r9, %ctaid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r1, %r8, %r9, %r10;
+ mov.u32 %r11, %ntid.y;
+ mov.u32 %r12, %ctaid.y;
+ mov.u32 %r13, %tid.y;
+ mad.lo.s32 %r2, %r11, %r12, %r13;
+ add.s32 %r3, %r6, %r4;
+ setp.lt.s32 %p1, %r1, %r7;
+ setp.lt.s32 %p2, %r2, %r4;
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB10_2;
+ bra.uni BB10_1;
+
+BB10_1:
+ cvta.to.global.u64 %rd5, %rd2;
+ mad.lo.s32 %r14, %r1, %r4, %r2;
+ mul.wide.s32 %rd6, %r14, 8;
+ add.s64 %rd7, %rd5, %rd6;
+ ld.global.f64 %fd1, [%rd7];
+ mad.lo.s32 %r15, %r1, %r3, %r2;
+ mul.wide.s32 %rd8, %r15, 8;
+ add.s64 %rd9, %rd1, %rd8;
+ st.global.f64 [%rd9], %fd1;
+
+BB10_2:
+ setp.lt.s32 %p4, %r1, %r5;
+ setp.lt.s32 %p5, %r2, %r6;
+ and.pred %p6, %p4, %p5;
+ @!%p6 bra BB10_4;
+ bra.uni BB10_3;
+
+BB10_3:
+ cvta.to.global.u64 %rd10, %rd3;
+ mad.lo.s32 %r16, %r1, %r6, %r2;
+ mul.wide.s32 %rd11, %r16, 8;
+ add.s64 %rd12, %rd10, %rd11;
+ ld.global.f64 %fd2, [%rd12];
+ mad.lo.s32 %r17, %r1, %r3, %r4;
+ add.s32 %r18, %r17, %r2;
+ mul.wide.s32 %rd13, %r18, 8;
+ add.s64 %rd14, %rd1, %rd13;
+ st.global.f64 [%rd14], %fd2;
+
+BB10_4:
+ ret;
+}
+
+ // .globl rbind
+.visible .entry rbind(
+ .param .u64 rbind_param_0,
+ .param .u64 rbind_param_1,
+ .param .u64 rbind_param_2,
+ .param .u32 rbind_param_3,
+ .param .u32 rbind_param_4,
+ .param .u32 rbind_param_5,
+ .param .u32 rbind_param_6
+)
+{
+ .reg .pred %p<7>;
+ .reg .b32 %r<17>;
+ .reg .f64 %fd<3>;
+ .reg .b64 %rd<14>;
+
+
+ ld.param.u64 %rd2, [rbind_param_0];
+ ld.param.u64 %rd3, [rbind_param_1];
+ ld.param.u64 %rd4, [rbind_param_2];
+ ld.param.u32 %r3, [rbind_param_3];
+ ld.param.u32 %r4, [rbind_param_4];
+ ld.param.u32 %r5, [rbind_param_5];
+ ld.param.u32 %r6, [rbind_param_6];
+ cvta.to.global.u64 %rd1, %rd4;
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ mov.u32 %r10, %ntid.y;
+ mov.u32 %r11, %ctaid.y;
+ mov.u32 %r12, %tid.y;
+ mad.lo.s32 %r2, %r10, %r11, %r12;
+ setp.lt.s32 %p1, %r1, %r3;
+ setp.lt.s32 %p2, %r2, %r4;
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB11_2;
+ bra.uni BB11_1;
+
+BB11_1:
+ cvta.to.global.u64 %rd5, %rd2;
+ mad.lo.s32 %r13, %r1, %r4, %r2;
+ mul.wide.s32 %rd6, %r13, 8;
+ add.s64 %rd7, %rd5, %rd6;
+ ld.global.f64 %fd1, [%rd7];
+ add.s64 %rd8, %rd1, %rd6;
+ st.global.f64 [%rd8], %fd1;
+
+BB11_2:
+ setp.lt.s32 %p4, %r1, %r5;
+ setp.lt.s32 %p5, %r2, %r6;
+ and.pred %p6, %p4, %p5;
+ @!%p6 bra BB11_4;
+ bra.uni BB11_3;
+
+BB11_3:
+ cvta.to.global.u64 %rd9, %rd3;
+ mad.lo.s32 %r14, %r1, %r6, %r2;
+ mul.wide.s32 %rd10, %r14, 8;
+ add.s64 %rd11, %rd9, %rd10;
+ ld.global.f64 %fd2, [%rd11];
+ add.s32 %r15, %r1, %r3;
+ mad.lo.s32 %r16, %r15, %r4, %r2;
+ mul.wide.s32 %rd12, %r16, 8;
+ add.s64 %rd13, %rd1, %rd12;
+ st.global.f64 [%rd13], %fd2;
+
+BB11_4:
+ ret;
+}
+
// .globl reduce_sum
.visible .entry reduce_sum(
.param .u64 reduce_sum_param_0,
@@ -1837,9 +1982,9 @@ BB9_2:
mov.f64 %fd76, 0d0000000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB10_4;
+ @%p1 bra BB12_4;
-BB10_1:
+BB12_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -1848,23 +1993,23 @@ BB10_1:
add.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB10_3;
+ @%p2 bra BB12_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
add.f64 %fd78, %fd78, %fd31;
-BB10_3:
+BB12_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB10_1;
+ @%p3 bra BB12_1;
-BB10_4:
+BB12_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -1872,130 +2017,130 @@ BB10_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB10_8;
+ @%p4 bra BB12_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB10_7;
+ @%p5 bra BB12_7;
ld.shared.f64 %fd32, [%rd1+4096];
add.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB10_7:
+BB12_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB10_8:
+BB12_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB10_12;
+ @%p6 bra BB12_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB10_11;
+ @%p7 bra BB12_11;
ld.shared.f64 %fd33, [%rd1+2048];
add.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB10_11:
+BB12_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB10_12:
+BB12_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB10_16;
+ @%p8 bra BB12_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB10_15;
+ @%p9 bra BB12_15;
ld.shared.f64 %fd34, [%rd1+1024];
add.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB10_15:
+BB12_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB10_16:
+BB12_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB10_20;
+ @%p10 bra BB12_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB10_19;
+ @%p11 bra BB12_19;
ld.shared.f64 %fd35, [%rd1+512];
add.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB10_19:
+BB12_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB10_20:
+BB12_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB10_33;
+ @%p12 bra BB12_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB10_23;
+ @%p13 bra BB12_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
add.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB10_23:
+BB12_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB10_25;
+ @%p14 bra BB12_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
add.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB10_25:
+BB12_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB10_27;
+ @%p15 bra BB12_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
add.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB10_27:
+BB12_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB10_29;
+ @%p16 bra BB12_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
add.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB10_29:
+BB12_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB10_31;
+ @%p17 bra BB12_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
add.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB10_31:
+BB12_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB10_33;
+ @%p18 bra BB12_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
add.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB10_33:
+BB12_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB10_35;
+ @%p19 bra BB12_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -2003,7 +2148,7 @@ BB10_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB10_35:
+BB12_35:
ret;
}
@@ -2027,17 +2172,17 @@ BB10_35:
ld.param.u32 %r4, [reduce_row_sum_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB11_35;
+ @%p1 bra BB13_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d0000000000000000;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB11_4;
+ @%p2 bra BB13_4;
cvta.to.global.u64 %rd3, %rd1;
-BB11_3:
+BB13_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2047,9 +2192,9 @@ BB11_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB11_3;
+ @%p3 bra BB13_3;
-BB11_4:
+BB13_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -2059,130 +2204,130 @@ BB11_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB11_8;
+ @%p4 bra BB13_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB11_7;
+ @%p5 bra BB13_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB11_7:
+BB13_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB11_8:
+BB13_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB11_12;
+ @%p6 bra BB13_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB11_11;
+ @%p7 bra BB13_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB11_11:
+BB13_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB11_12:
+BB13_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB11_16;
+ @%p8 bra BB13_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB11_15;
+ @%p9 bra BB13_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB11_15:
+BB13_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB11_16:
+BB13_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB11_20;
+ @%p10 bra BB13_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB11_19;
+ @%p11 bra BB13_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB11_19:
+BB13_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB11_20:
+BB13_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB11_33;
+ @%p12 bra BB13_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB11_23;
+ @%p13 bra BB13_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB11_23:
+BB13_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB11_25;
+ @%p14 bra BB13_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB11_25:
+BB13_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB11_27;
+ @%p15 bra BB13_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB11_27:
+BB13_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB11_29;
+ @%p16 bra BB13_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB11_29:
+BB13_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB11_31;
+ @%p17 bra BB13_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB11_31:
+BB13_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB11_33;
+ @%p18 bra BB13_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB11_33:
+BB13_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB11_35;
+ @%p19 bra BB13_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -2190,7 +2335,7 @@ BB11_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB11_35:
+BB13_35:
ret;
}
@@ -2217,18 +2362,18 @@ BB11_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB12_5;
+ @%p1 bra BB14_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d0000000000000000;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB12_4;
+ @%p2 bra BB14_4;
mov.u32 %r10, %r1;
-BB12_3:
+BB14_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -2238,15 +2383,15 @@ BB12_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB12_3;
+ @%p3 bra BB14_3;
-BB12_4:
+BB14_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB12_5:
+BB14_5:
ret;
}
@@ -2274,9 +2419,9 @@ BB12_5:
mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB13_4;
+ @%p1 bra BB15_4;
-BB13_1:
+BB15_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -2285,23 +2430,23 @@ BB13_1:
max.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB13_3;
+ @%p2 bra BB15_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
max.f64 %fd78, %fd78, %fd31;
-BB13_3:
+BB15_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB13_1;
+ @%p3 bra BB15_1;
-BB13_4:
+BB15_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -2309,130 +2454,130 @@ BB13_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB13_8;
+ @%p4 bra BB15_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB13_7;
+ @%p5 bra BB15_7;
ld.shared.f64 %fd32, [%rd1+4096];
max.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB13_7:
+BB15_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB13_8:
+BB15_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB13_12;
+ @%p6 bra BB15_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB13_11;
+ @%p7 bra BB15_11;
ld.shared.f64 %fd33, [%rd1+2048];
max.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB13_11:
+BB15_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB13_12:
+BB15_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB13_16;
+ @%p8 bra BB15_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB13_15;
+ @%p9 bra BB15_15;
ld.shared.f64 %fd34, [%rd1+1024];
max.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB13_15:
+BB15_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB13_16:
+BB15_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB13_20;
+ @%p10 bra BB15_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB13_19;
+ @%p11 bra BB15_19;
ld.shared.f64 %fd35, [%rd1+512];
max.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB13_19:
+BB15_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB13_20:
+BB15_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB13_33;
+ @%p12 bra BB15_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB13_23;
+ @%p13 bra BB15_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
max.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB13_23:
+BB15_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB13_25;
+ @%p14 bra BB15_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
max.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB13_25:
+BB15_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB13_27;
+ @%p15 bra BB15_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
max.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB13_27:
+BB15_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB13_29;
+ @%p16 bra BB15_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
max.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB13_29:
+BB15_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB13_31;
+ @%p17 bra BB15_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
max.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB13_31:
+BB15_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB13_33;
+ @%p18 bra BB15_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
max.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB13_33:
+BB15_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB13_35;
+ @%p19 bra BB15_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -2440,7 +2585,7 @@ BB13_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB13_35:
+BB15_35:
ret;
}
@@ -2464,17 +2609,17 @@ BB13_35:
ld.param.u32 %r4, [reduce_row_max_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB14_35;
+ @%p1 bra BB16_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB14_4;
+ @%p2 bra BB16_4;
cvta.to.global.u64 %rd3, %rd1;
-BB14_3:
+BB16_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2484,9 +2629,9 @@ BB14_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB14_3;
+ @%p3 bra BB16_3;
-BB14_4:
+BB16_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -2496,130 +2641,130 @@ BB14_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB14_8;
+ @%p4 bra BB16_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB14_7;
+ @%p5 bra BB16_7;
ld.shared.f64 %fd29, [%rd8+4096];
max.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB14_7:
+BB16_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB14_8:
+BB16_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB14_12;
+ @%p6 bra BB16_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB14_11;
+ @%p7 bra BB16_11;
ld.shared.f64 %fd30, [%rd8+2048];
max.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB14_11:
+BB16_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB14_12:
+BB16_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB14_16;
+ @%p8 bra BB16_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB14_15;
+ @%p9 bra BB16_15;
ld.shared.f64 %fd31, [%rd8+1024];
max.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB14_15:
+BB16_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB14_16:
+BB16_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB14_20;
+ @%p10 bra BB16_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB14_19;
+ @%p11 bra BB16_19;
ld.shared.f64 %fd32, [%rd8+512];
max.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB14_19:
+BB16_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB14_20:
+BB16_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB14_33;
+ @%p12 bra BB16_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB14_23;
+ @%p13 bra BB16_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
max.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB14_23:
+BB16_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB14_25;
+ @%p14 bra BB16_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
max.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB14_25:
+BB16_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB14_27;
+ @%p15 bra BB16_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
max.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB14_27:
+BB16_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB14_29;
+ @%p16 bra BB16_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
max.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB14_29:
+BB16_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB14_31;
+ @%p17 bra BB16_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
max.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB14_31:
+BB16_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB14_33;
+ @%p18 bra BB16_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
max.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB14_33:
+BB16_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB14_35;
+ @%p19 bra BB16_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -2627,7 +2772,7 @@ BB14_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB14_35:
+BB16_35:
ret;
}
@@ -2654,18 +2799,18 @@ BB14_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB15_5;
+ @%p1 bra BB17_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB15_4;
+ @%p2 bra BB17_4;
mov.u32 %r10, %r1;
-BB15_3:
+BB17_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -2675,15 +2820,15 @@ BB15_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB15_3;
+ @%p3 bra BB17_3;
-BB15_4:
+BB17_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB15_5:
+BB17_5:
ret;
}
@@ -2711,9 +2856,9 @@ BB15_5:
mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB16_4;
+ @%p1 bra BB18_4;
-BB16_1:
+BB18_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -2722,23 +2867,23 @@ BB16_1:
min.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB16_3;
+ @%p2 bra BB18_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
min.f64 %fd78, %fd78, %fd31;
-BB16_3:
+BB18_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB16_1;
+ @%p3 bra BB18_1;
-BB16_4:
+BB18_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -2746,130 +2891,130 @@ BB16_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB16_8;
+ @%p4 bra BB18_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB16_7;
+ @%p5 bra BB18_7;
ld.shared.f64 %fd32, [%rd1+4096];
min.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB16_7:
+BB18_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB16_8:
+BB18_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB16_12;
+ @%p6 bra BB18_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB16_11;
+ @%p7 bra BB18_11;
ld.shared.f64 %fd33, [%rd1+2048];
min.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB16_11:
+BB18_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB16_12:
+BB18_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB16_16;
+ @%p8 bra BB18_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB16_15;
+ @%p9 bra BB18_15;
ld.shared.f64 %fd34, [%rd1+1024];
min.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB16_15:
+BB18_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB16_16:
+BB18_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB16_20;
+ @%p10 bra BB18_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB16_19;
+ @%p11 bra BB18_19;
ld.shared.f64 %fd35, [%rd1+512];
min.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB16_19:
+BB18_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB16_20:
+BB18_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB16_33;
+ @%p12 bra BB18_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB16_23;
+ @%p13 bra BB18_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
min.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB16_23:
+BB18_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB16_25;
+ @%p14 bra BB18_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
min.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB16_25:
+BB18_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB16_27;
+ @%p15 bra BB18_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
min.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB16_27:
+BB18_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB16_29;
+ @%p16 bra BB18_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
min.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB16_29:
+BB18_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB16_31;
+ @%p17 bra BB18_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
min.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB16_31:
+BB18_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB16_33;
+ @%p18 bra BB18_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
min.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB16_33:
+BB18_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB16_35;
+ @%p19 bra BB18_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -2877,7 +3022,7 @@ BB16_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB16_35:
+BB18_35:
ret;
}
@@ -2901,17 +3046,17 @@ BB16_35:
ld.param.u32 %r4, [reduce_row_min_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB17_35;
+ @%p1 bra BB19_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd73, %fd72;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB17_4;
+ @%p2 bra BB19_4;
cvta.to.global.u64 %rd3, %rd1;
-BB17_3:
+BB19_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -2921,9 +3066,9 @@ BB17_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd72, %fd73;
- @%p3 bra BB17_3;
+ @%p3 bra BB19_3;
-BB17_4:
+BB19_4:
mov.f64 %fd70, %fd72;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -2933,130 +3078,130 @@ BB17_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB17_8;
+ @%p4 bra BB19_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd71, %fd70;
- @%p5 bra BB17_7;
+ @%p5 bra BB19_7;
ld.shared.f64 %fd29, [%rd8+4096];
min.f64 %fd71, %fd70, %fd29;
st.shared.f64 [%rd8], %fd71;
-BB17_7:
+BB19_7:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB17_8:
+BB19_8:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB17_12;
+ @%p6 bra BB19_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd69, %fd68;
- @%p7 bra BB17_11;
+ @%p7 bra BB19_11;
ld.shared.f64 %fd30, [%rd8+2048];
min.f64 %fd69, %fd68, %fd30;
st.shared.f64 [%rd8], %fd69;
-BB17_11:
+BB19_11:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB17_12:
+BB19_12:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB17_16;
+ @%p8 bra BB19_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd67, %fd66;
- @%p9 bra BB17_15;
+ @%p9 bra BB19_15;
ld.shared.f64 %fd31, [%rd8+1024];
min.f64 %fd67, %fd66, %fd31;
st.shared.f64 [%rd8], %fd67;
-BB17_15:
+BB19_15:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB17_16:
+BB19_16:
mov.f64 %fd64, %fd66;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB17_20;
+ @%p10 bra BB19_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd65, %fd64;
- @%p11 bra BB17_19;
+ @%p11 bra BB19_19;
ld.shared.f64 %fd32, [%rd8+512];
min.f64 %fd65, %fd64, %fd32;
st.shared.f64 [%rd8], %fd65;
-BB17_19:
+BB19_19:
mov.f64 %fd64, %fd65;
bar.sync 0;
-BB17_20:
+BB19_20:
mov.f64 %fd63, %fd64;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB17_33;
+ @%p12 bra BB19_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB17_23;
+ @%p13 bra BB19_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
min.f64 %fd63, %fd63, %fd33;
st.volatile.shared.f64 [%rd8], %fd63;
-BB17_23:
+BB19_23:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB17_25;
+ @%p14 bra BB19_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
min.f64 %fd62, %fd62, %fd34;
st.volatile.shared.f64 [%rd8], %fd62;
-BB17_25:
+BB19_25:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB17_27;
+ @%p15 bra BB19_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
min.f64 %fd61, %fd61, %fd35;
st.volatile.shared.f64 [%rd8], %fd61;
-BB17_27:
+BB19_27:
mov.f64 %fd60, %fd61;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB17_29;
+ @%p16 bra BB19_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
min.f64 %fd60, %fd60, %fd36;
st.volatile.shared.f64 [%rd8], %fd60;
-BB17_29:
+BB19_29:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB17_31;
+ @%p17 bra BB19_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
min.f64 %fd59, %fd59, %fd37;
st.volatile.shared.f64 [%rd8], %fd59;
-BB17_31:
+BB19_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB17_33;
+ @%p18 bra BB19_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
min.f64 %fd39, %fd59, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB17_33:
+BB19_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB17_35;
+ @%p19 bra BB19_35;
ld.shared.f64 %fd40, [sdata];
cvta.to.global.u64 %rd39, %rd2;
@@ -3064,7 +3209,7 @@ BB17_33:
add.s64 %rd41, %rd39, %rd40;
st.global.f64 [%rd41], %fd40;
-BB17_35:
+BB19_35:
ret;
}
@@ -3091,18 +3236,18 @@ BB17_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB18_5;
+ @%p1 bra BB20_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
mov.f64 %fd9, %fd8;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB18_4;
+ @%p2 bra BB20_4;
mov.u32 %r10, %r1;
-BB18_3:
+BB20_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -3112,15 +3257,15 @@ BB18_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd8, %fd9;
- @%p3 bra BB18_3;
+ @%p3 bra BB20_3;
-BB18_4:
+BB20_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
-BB18_5:
+BB20_5:
ret;
}
@@ -3148,9 +3293,9 @@ BB18_5:
mov.f64 %fd76, 0d3FF0000000000000;
mov.f64 %fd77, %fd76;
setp.ge.u32 %p1, %r32, %r5;
- @%p1 bra BB19_4;
+ @%p1 bra BB21_4;
-BB19_1:
+BB21_1:
mov.f64 %fd1, %fd77;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r32, 8;
@@ -3159,23 +3304,23 @@ BB19_1:
mul.f64 %fd78, %fd1, %fd30;
add.s32 %r3, %r32, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB19_3;
+ @%p2 bra BB21_3;
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd31, [%rd9];
mul.f64 %fd78, %fd78, %fd31;
-BB19_3:
+BB21_3:
mov.f64 %fd77, %fd78;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r32, %r12, %r13, %r32;
setp.lt.u32 %p3, %r32, %r5;
mov.f64 %fd76, %fd77;
- @%p3 bra BB19_1;
+ @%p3 bra BB21_1;
-BB19_4:
+BB21_4:
mov.f64 %fd74, %fd76;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -3183,130 +3328,130 @@ BB19_4:
st.shared.f64 [%rd1], %fd74;
bar.sync 0;
setp.lt.u32 %p4, %r9, 1024;
- @%p4 bra BB19_8;
+ @%p4 bra BB21_8;
setp.gt.u32 %p5, %r6, 511;
mov.f64 %fd75, %fd74;
- @%p5 bra BB19_7;
+ @%p5 bra BB21_7;
ld.shared.f64 %fd32, [%rd1+4096];
mul.f64 %fd75, %fd74, %fd32;
st.shared.f64 [%rd1], %fd75;
-BB19_7:
+BB21_7:
mov.f64 %fd74, %fd75;
bar.sync 0;
-BB19_8:
+BB21_8:
mov.f64 %fd72, %fd74;
setp.lt.u32 %p6, %r9, 512;
- @%p6 bra BB19_12;
+ @%p6 bra BB21_12;
setp.gt.u32 %p7, %r6, 255;
mov.f64 %fd73, %fd72;
- @%p7 bra BB19_11;
+ @%p7 bra BB21_11;
ld.shared.f64 %fd33, [%rd1+2048];
mul.f64 %fd73, %fd72, %fd33;
st.shared.f64 [%rd1], %fd73;
-BB19_11:
+BB21_11:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB19_12:
+BB21_12:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p8, %r9, 256;
- @%p8 bra BB19_16;
+ @%p8 bra BB21_16;
setp.gt.u32 %p9, %r6, 127;
mov.f64 %fd71, %fd70;
- @%p9 bra BB19_15;
+ @%p9 bra BB21_15;
ld.shared.f64 %fd34, [%rd1+1024];
mul.f64 %fd71, %fd70, %fd34;
st.shared.f64 [%rd1], %fd71;
-BB19_15:
+BB21_15:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB19_16:
+BB21_16:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p10, %r9, 128;
- @%p10 bra BB19_20;
+ @%p10 bra BB21_20;
setp.gt.u32 %p11, %r6, 63;
mov.f64 %fd69, %fd68;
- @%p11 bra BB19_19;
+ @%p11 bra BB21_19;
ld.shared.f64 %fd35, [%rd1+512];
mul.f64 %fd69, %fd68, %fd35;
st.shared.f64 [%rd1], %fd69;
-BB19_19:
+BB21_19:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB19_20:
+BB21_20:
mov.f64 %fd67, %fd68;
setp.gt.u32 %p12, %r6, 31;
- @%p12 bra BB19_33;
+ @%p12 bra BB21_33;
setp.lt.u32 %p13, %r9, 64;
- @%p13 bra BB19_23;
+ @%p13 bra BB21_23;
ld.volatile.shared.f64 %fd36, [%rd1+256];
mul.f64 %fd67, %fd67, %fd36;
st.volatile.shared.f64 [%rd1], %fd67;
-BB19_23:
+BB21_23:
mov.f64 %fd66, %fd67;
setp.lt.u32 %p14, %r9, 32;
- @%p14 bra BB19_25;
+ @%p14 bra BB21_25;
ld.volatile.shared.f64 %fd37, [%rd1+128];
mul.f64 %fd66, %fd66, %fd37;
st.volatile.shared.f64 [%rd1], %fd66;
-BB19_25:
+BB21_25:
mov.f64 %fd65, %fd66;
setp.lt.u32 %p15, %r9, 16;
- @%p15 bra BB19_27;
+ @%p15 bra BB21_27;
ld.volatile.shared.f64 %fd38, [%rd1+64];
mul.f64 %fd65, %fd65, %fd38;
st.volatile.shared.f64 [%rd1], %fd65;
-BB19_27:
+BB21_27:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p16, %r9, 8;
- @%p16 bra BB19_29;
+ @%p16 bra BB21_29;
ld.volatile.shared.f64 %fd39, [%rd1+32];
mul.f64 %fd64, %fd64, %fd39;
st.volatile.shared.f64 [%rd1], %fd64;
-BB19_29:
+BB21_29:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p17, %r9, 4;
- @%p17 bra BB19_31;
+ @%p17 bra BB21_31;
ld.volatile.shared.f64 %fd40, [%rd1+16];
mul.f64 %fd63, %fd63, %fd40;
st.volatile.shared.f64 [%rd1], %fd63;
-BB19_31:
+BB21_31:
setp.lt.u32 %p18, %r9, 2;
- @%p18 bra BB19_33;
+ @%p18 bra BB21_33;
ld.volatile.shared.f64 %fd41, [%rd1+8];
mul.f64 %fd42, %fd63, %fd41;
st.volatile.shared.f64 [%rd1], %fd42;
-BB19_33:
+BB21_33:
setp.ne.s32 %p19, %r6, 0;
- @%p19 bra BB19_35;
+ @%p19 bra BB21_35;
ld.shared.f64 %fd43, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -3314,7 +3459,7 @@ BB19_33:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd43;
-BB19_35:
+BB21_35:
ret;
}
@@ -3338,17 +3483,17 @@ BB19_35:
ld.param.u32 %r4, [reduce_row_mean_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB20_35;
+ @%p1 bra BB22_35;
mov.u32 %r38, %tid.x;
mov.f64 %fd74, 0d0000000000000000;
mov.f64 %fd75, %fd74;
setp.ge.u32 %p2, %r38, %r4;
- @%p2 bra BB20_4;
+ @%p2 bra BB22_4;
cvta.to.global.u64 %rd3, %rd1;
-BB20_3:
+BB22_3:
mad.lo.s32 %r8, %r6, %r4, %r38;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
@@ -3358,9 +3503,9 @@ BB20_3:
add.s32 %r38, %r9, %r38;
setp.lt.u32 %p3, %r38, %r4;
mov.f64 %fd74, %fd75;
- @%p3 bra BB20_3;
+ @%p3 bra BB22_3;
-BB20_4:
+BB22_4:
mov.f64 %fd72, %fd74;
mov.u32 %r10, %tid.x;
mul.wide.u32 %rd6, %r10, 8;
@@ -3370,130 +3515,130 @@ BB20_4:
bar.sync 0;
mov.u32 %r11, %ntid.x;
setp.lt.u32 %p4, %r11, 1024;
- @%p4 bra BB20_8;
+ @%p4 bra BB22_8;
setp.gt.u32 %p5, %r10, 511;
mov.f64 %fd73, %fd72;
- @%p5 bra BB20_7;
+ @%p5 bra BB22_7;
ld.shared.f64 %fd29, [%rd8+4096];
add.f64 %fd73, %fd72, %fd29;
st.shared.f64 [%rd8], %fd73;
-BB20_7:
+BB22_7:
mov.f64 %fd72, %fd73;
bar.sync 0;
-BB20_8:
+BB22_8:
mov.f64 %fd70, %fd72;
setp.lt.u32 %p6, %r11, 512;
- @%p6 bra BB20_12;
+ @%p6 bra BB22_12;
setp.gt.u32 %p7, %r10, 255;
mov.f64 %fd71, %fd70;
- @%p7 bra BB20_11;
+ @%p7 bra BB22_11;
ld.shared.f64 %fd30, [%rd8+2048];
add.f64 %fd71, %fd70, %fd30;
st.shared.f64 [%rd8], %fd71;
-BB20_11:
+BB22_11:
mov.f64 %fd70, %fd71;
bar.sync 0;
-BB20_12:
+BB22_12:
mov.f64 %fd68, %fd70;
setp.lt.u32 %p8, %r11, 256;
- @%p8 bra BB20_16;
+ @%p8 bra BB22_16;
setp.gt.u32 %p9, %r10, 127;
mov.f64 %fd69, %fd68;
- @%p9 bra BB20_15;
+ @%p9 bra BB22_15;
ld.shared.f64 %fd31, [%rd8+1024];
add.f64 %fd69, %fd68, %fd31;
st.shared.f64 [%rd8], %fd69;
-BB20_15:
+BB22_15:
mov.f64 %fd68, %fd69;
bar.sync 0;
-BB20_16:
+BB22_16:
mov.f64 %fd66, %fd68;
setp.lt.u32 %p10, %r11, 128;
- @%p10 bra BB20_20;
+ @%p10 bra BB22_20;
setp.gt.u32 %p11, %r10, 63;
mov.f64 %fd67, %fd66;
- @%p11 bra BB20_19;
+ @%p11 bra BB22_19;
ld.shared.f64 %fd32, [%rd8+512];
add.f64 %fd67, %fd66, %fd32;
st.shared.f64 [%rd8], %fd67;
-BB20_19:
+BB22_19:
mov.f64 %fd66, %fd67;
bar.sync 0;
-BB20_20:
+BB22_20:
mov.f64 %fd65, %fd66;
setp.gt.u32 %p12, %r10, 31;
- @%p12 bra BB20_33;
+ @%p12 bra BB22_33;
setp.lt.u32 %p13, %r11, 64;
- @%p13 bra BB20_23;
+ @%p13 bra BB22_23;
ld.volatile.shared.f64 %fd33, [%rd8+256];
add.f64 %fd65, %fd65, %fd33;
st.volatile.shared.f64 [%rd8], %fd65;
-BB20_23:
+BB22_23:
mov.f64 %fd64, %fd65;
setp.lt.u32 %p14, %r11, 32;
- @%p14 bra BB20_25;
+ @%p14 bra BB22_25;
ld.volatile.shared.f64 %fd34, [%rd8+128];
add.f64 %fd64, %fd64, %fd34;
st.volatile.shared.f64 [%rd8], %fd64;
-BB20_25:
+BB22_25:
mov.f64 %fd63, %fd64;
setp.lt.u32 %p15, %r11, 16;
- @%p15 bra BB20_27;
+ @%p15 bra BB22_27;
ld.volatile.shared.f64 %fd35, [%rd8+64];
add.f64 %fd63, %fd63, %fd35;
st.volatile.shared.f64 [%rd8], %fd63;
-BB20_27:
+BB22_27:
mov.f64 %fd62, %fd63;
setp.lt.u32 %p16, %r11, 8;
- @%p16 bra BB20_29;
+ @%p16 bra BB22_29;
ld.volatile.shared.f64 %fd36, [%rd8+32];
add.f64 %fd62, %fd62, %fd36;
st.volatile.shared.f64 [%rd8], %fd62;
-BB20_29:
+BB22_29:
mov.f64 %fd61, %fd62;
setp.lt.u32 %p17, %r11, 4;
- @%p17 bra BB20_31;
+ @%p17 bra BB22_31;
ld.volatile.shared.f64 %fd37, [%rd8+16];
add.f64 %fd61, %fd61, %fd37;
st.volatile.shared.f64 [%rd8], %fd61;
-BB20_31:
+BB22_31:
setp.lt.u32 %p18, %r11, 2;
- @%p18 bra BB20_33;
+ @%p18 bra BB22_33;
ld.volatile.shared.f64 %fd38, [%rd8+8];
add.f64 %fd39, %fd61, %fd38;
st.volatile.shared.f64 [%rd8], %fd39;
-BB20_33:
+BB22_33:
setp.ne.s32 %p19, %r10, 0;
- @%p19 bra BB20_35;
+ @%p19 bra BB22_35;
ld.shared.f64 %fd40, [sdata];
cvt.u64.u32 %rd39, %r4;
@@ -3504,7 +3649,7 @@ BB20_33:
add.s64 %rd42, %rd40, %rd41;
st.global.f64 [%rd42], %fd42;
-BB20_35:
+BB22_35:
ret;
}
@@ -3531,18 +3676,18 @@ BB20_35:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB21_5;
+ @%p1 bra BB23_5;
cvta.to.global.u64 %rd1, %rd2;
mul.lo.s32 %r2, %r6, %r5;
mov.f64 %fd10, 0d0000000000000000;
mov.f64 %fd11, %fd10;
setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB21_4;
+ @%p2 bra BB23_4;
mov.u32 %r10, %r1;
-BB21_3:
+BB23_3:
mov.u32 %r3, %r10;
mul.wide.u32 %rd4, %r3, 8;
add.s64 %rd5, %rd1, %rd4;
@@ -3552,9 +3697,9 @@ BB21_3:
setp.lt.u32 %p3, %r4, %r2;
mov.u32 %r10, %r4;
mov.f64 %fd10, %fd11;
- @%p3 bra BB21_3;
+ @%p3 bra BB23_3;
-BB21_4:
+BB23_4:
cvta.to.global.u64 %rd6, %rd3;
cvt.u64.u32 %rd7, %r5;
cvt.rn.f64.s64 %fd7, %rd7;
@@ -3563,7 +3708,7 @@ BB21_4:
add.s64 %rd9, %rd6, %rd8;
st.global.f64 [%rd9], %fd8;
-BB21_5:
+BB23_5:
ret;
}
@@ -3589,7 +3734,7 @@ BB21_5:
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
- @%p1 bra BB22_5;
+ @%p1 bra BB24_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -3649,13 +3794,13 @@ BB21_5:
mov.b32 %f2, %r11;
abs.f32 %f1, %f2;
setp.lt.f32 %p2, %f1, 0f4086232B;
- @%p2 bra BB22_4;
+ @%p2 bra BB24_4;
setp.lt.f64 %p3, %fd1, 0d0000000000000000;
add.f64 %fd37, %fd1, 0d7FF0000000000000;
selp.f64 %fd40, 0d0000000000000000, %fd37, %p3;
setp.geu.f32 %p4, %f1, 0f40874800;
- @%p4 bra BB22_4;
+ @%p4 bra BB24_4;
shr.u32 %r12, %r2, 31;
add.s32 %r13, %r2, %r12;
@@ -3670,13 +3815,13 @@ BB21_5:
mov.b64 %fd39, {%r20, %r19};
mul.f64 %fd40, %fd38, %fd39;
-BB22_4:
+BB24_4:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd40;
-BB22_5:
+BB24_5:
ret;
}
@@ -3701,7 +3846,7 @@ BB22_5:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB23_2;
+ @%p1 bra BB25_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
@@ -3712,7 +3857,7 @@ BB22_5:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
-BB23_2:
+BB25_2:
ret;
}
@@ -3737,7 +3882,7 @@ BB23_2:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB24_4;
+ @%p1 bra BB26_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -3746,7 +3891,7 @@ BB23_2:
ld.global.f64 %fd9, [%rd6];
abs.f64 %fd2, %fd9;
setp.ge.f64 %p2, %fd2, 0d4330000000000000;
- @%p2 bra BB24_3;
+ @%p2 bra BB26_3;
add.f64 %fd5, %fd2, 0d3FE0000000000000;
cvt.rzi.f64.f64 %fd6, %fd5;
@@ -3768,7 +3913,7 @@ BB23_2:
or.b32 %r10, %r7, %r9;
mov.b64 %fd9, {%r6, %r10};
-BB24_3:
+BB26_3:
cvta.to.global.u64 %rd7, %rd3;
cvt.rzi.s64.f64 %rd8, %fd9;
cvt.rn.f64.s64 %fd8, %rd8;
@@ -3776,7 +3921,7 @@ BB24_3:
add.s64 %rd10, %rd7, %rd9;
st.global.f64 [%rd10], %fd8;
-BB24_4:
+BB26_4:
ret;
}
@@ -3801,7 +3946,7 @@ BB24_4:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB25_2;
+ @%p1 bra BB27_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
@@ -3812,7 +3957,7 @@ BB24_4:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
-BB25_2:
+BB27_2:
ret;
}
@@ -3838,7 +3983,7 @@ BB25_2:
mov.u32 %r15, %tid.x;
mad.lo.s32 %r1, %r14, %r13, %r15;
setp.ge.u32 %p1, %r1, %r12;
- @%p1 bra BB26_9;
+ @%p1 bra BB28_9;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -3855,7 +4000,7 @@ BB25_2:
}
mov.u32 %r31, -1023;
setp.gt.s32 %p2, %r29, 1048575;
- @%p2 bra BB26_3;
+ @%p2 bra BB28_3;
mul.f64 %fd56, %fd56, 0d4350000000000000;
{
@@ -3868,20 +4013,20 @@ BB25_2:
}
mov.u32 %r31, -1077;
-BB26_3:
+BB28_3:
add.s32 %r18, %r29, -1;
setp.lt.u32 %p3, %r18, 2146435071;
- @%p3 bra BB26_5;
- bra.uni BB26_4;
+ @%p3 bra BB28_5;
+ bra.uni BB28_4;
-BB26_5:
+BB28_5:
shr.u32 %r20, %r29, 20;
add.s32 %r32, %r31, %r20;
and.b32 %r21, %r29, -2146435073;
or.b32 %r22, %r21, 1072693248;
mov.b64 %fd57, {%r30, %r22};
setp.lt.s32 %p5, %r22, 1073127583;
- @%p5 bra BB26_7;
+ @%p5 bra BB28_7;
{
.reg .b32 %temp;
@@ -3895,7 +4040,7 @@ BB26_5:
mov.b64 %fd57, {%r23, %r25};
add.s32 %r32, %r32, 1;
-BB26_7:
+BB28_7:
add.f64 %fd13, %fd57, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd12,%fd13;
@@ -3946,9 +4091,9 @@ BB26_7:
mov.f64 %fd54, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd55, %fd47, %fd54, %fd53;
add.f64 %fd58, %fd49, %fd55;
- bra.uni BB26_8;
+ bra.uni BB28_8;
-BB26_4:
+BB28_4:
mov.f64 %fd10, 0d7FF0000000000000;
fma.rn.f64 %fd11, %fd56, %fd10, %fd10;
{
@@ -3959,13 +4104,13 @@ BB26_4:
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4;
-BB26_8:
+BB28_8:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd58;
-BB26_9:
+BB28_9:
ret;
}
@@ -3990,7 +4135,7 @@ BB26_9:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB27_2;
+ @%p1 bra BB29_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
@@ -4001,7 +4146,7 @@ BB26_9:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
-BB27_2:
+BB29_2:
ret;
}
@@ -4026,7 +4171,7 @@ BB27_2:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB28_2;
+ @%p1 bra BB30_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
@@ -4037,7 +4182,7 @@ BB27_2:
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
-BB28_2:
+BB30_2:
ret;
}
@@ -4048,7 +4193,7 @@ BB28_2:
.param .u32 matrix_sin_param_2
)
{
- .local .align 4 .b8 __local_depot29[4];
+ .local .align 4 .b8 __local_depot31[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<7>;
@@ -4057,7 +4202,7 @@ BB28_2:
.reg .b64 %rd<17>;
- mov.u64 %rd16, __local_depot29;
+ mov.u64 %rd16, __local_depot31;
cvta.local.u64 %SP, %rd16;
ld.param.u64 %rd3, [matrix_sin_param_0];
ld.param.u64 %rd4, [matrix_sin_param_1];
@@ -4069,7 +4214,7 @@ BB28_2:
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
- @%p1 bra BB29_11;
+ @%p1 bra BB31_11;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4082,19 +4227,19 @@ BB28_2:
}
and.b32 %r10, %r9, 2147483647;
setp.ne.s32 %p2, %r10, 2146435072;
- @%p2 bra BB29_4;
+ @%p2 bra BB31_4;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd38;
}
setp.ne.s32 %p3, %r11, 0;
- @%p3 bra BB29_4;
+ @%p3 bra BB31_4;
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
-BB29_4:
+BB31_4:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r17, %fd15;
st.local.u32 [%rd1], %r17;
@@ -4112,7 +4257,7 @@ BB29_4:
}
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p4, %r13, 1105199104;
- @%p4 bra BB29_6;
+ @%p4 bra BB31_6;
// Callseq Start 3
{
@@ -4135,7 +4280,7 @@ BB29_4:
}// Callseq End 3
ld.local.u32 %r17, [%rd1];
-BB29_6:
+BB31_6:
and.b32 %r14, %r17, 1;
shl.b32 %r15, %r14, 3;
setp.eq.s32 %p5, %r14, 0;
@@ -4157,27 +4302,27 @@ BB29_6:
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
- @%p5 bra BB29_8;
+ @%p5 bra BB31_8;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
-BB29_8:
+BB31_8:
and.b32 %r16, %r17, 2;
setp.eq.s32 %p6, %r16, 0;
- @%p6 bra BB29_10;
+ @%p6 bra BB31_10;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
-BB29_10:
+BB31_10:
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
-BB29_11:
+BB31_11:
ret;
}
@@ -4188,7 +4333,7 @@ BB29_11:
.param .u32 matrix_cos_param_2
)
{
- .local .align 4 .b8 __local_depot30[4];
+ .local .align 4 .b8 __local_depot32[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<7>;
@@ -4197,7 +4342,7 @@ BB29_11:
.reg .b64 %rd<17>;
- mov.u64 %rd16, __local_depot30;
+ mov.u64 %rd16, __local_depot32;
cvta.local.u64 %SP, %rd16;
ld.param.u64 %rd3, [matrix_cos_param_0];
ld.param.u64 %rd4, [matrix_cos_param_1];
@@ -4209,7 +4354,7 @@ BB29_11:
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB30_11;
+ @%p1 bra BB32_11;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4222,19 +4367,19 @@ BB29_11:
}
and.b32 %r11, %r10, 2147483647;
setp.ne.s32 %p2, %r11, 2146435072;
- @%p2 bra BB30_4;
+ @%p2 bra BB32_4;
{
.reg .b32 %temp;
mov.b64 {%r12, %temp}, %fd38;
}
setp.ne.s32 %p3, %r12, 0;
- @%p3 bra BB30_4;
+ @%p3 bra BB32_4;
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
-BB30_4:
+BB32_4:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r18, %fd15;
st.local.u32 [%rd1], %r18;
@@ -4252,7 +4397,7 @@ BB30_4:
}
and.b32 %r14, %r13, 2145386496;
setp.lt.u32 %p4, %r14, 1105199104;
- @%p4 bra BB30_6;
+ @%p4 bra BB32_6;
// Callseq Start 4
{
@@ -4275,7 +4420,7 @@ BB30_4:
}// Callseq End 4
ld.local.u32 %r18, [%rd1];
-BB30_6:
+BB32_6:
add.s32 %r5, %r18, 1;
and.b32 %r15, %r5, 1;
shl.b32 %r16, %r15, 3;
@@ -4298,27 +4443,27 @@ BB30_6:
ld.const.f64 %fd34, [%rd12+48];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
- @%p5 bra BB30_8;
+ @%p5 bra BB32_8;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
-BB30_8:
+BB32_8:
and.b32 %r17, %r5, 2;
setp.eq.s32 %p6, %r17, 0;
- @%p6 bra BB30_10;
+ @%p6 bra BB32_10;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
-BB30_10:
+BB32_10:
cvta.to.global.u64 %rd13, %rd4;
shl.b64 %rd14, %rd2, 3;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd40;
-BB30_11:
+BB32_11:
ret;
}
@@ -4329,7 +4474,7 @@ BB30_11:
.param .u32 matrix_tan_param_2
)
{
- .local .align 4 .b8 __local_depot31[4];
+ .local .align 4 .b8 __local_depot33[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<6>;
@@ -4338,7 +4483,7 @@ BB30_11:
.reg .b64 %rd<14>;
- mov.u64 %rd13, __local_depot31;
+ mov.u64 %rd13, __local_depot33;
cvta.local.u64 %SP, %rd13;
ld.param.u64 %rd3, [matrix_tan_param_0];
ld.param.u64 %rd4, [matrix_tan_param_1];
@@ -4350,7 +4495,7 @@ BB30_11:
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
- @%p1 bra BB31_9;
+ @%p1 bra BB33_9;
cvta.to.global.u64 %rd6, %rd3;
cvt.s64.s32 %rd2, %r1;
@@ -4363,19 +4508,19 @@ BB30_11:
}
and.b32 %r10, %r9, 2147483647;
setp.ne.s32 %p2, %r10, 2146435072;
- @%p2 bra BB31_4;
+ @%p2 bra BB33_4;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd63;
}
setp.ne.s32 %p3, %r11, 0;
- @%p3 bra BB31_4;
+ @%p3 bra BB33_4;
mov.f64 %fd11, 0d0000000000000000;
mul.rn.f64 %fd63, %fd63, %fd11;
-BB31_4:
+BB33_4:
mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r15, %fd12;
st.local.u32 [%rd1], %r15;
@@ -4393,7 +4538,7 @@ BB31_4:
}
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p4, %r13, 1105199104;
- @%p4 bra BB31_6;
+ @%p4 bra BB33_6;
// Callseq Start 5
{
@@ -4416,7 +4561,7 @@ BB31_4:
}// Callseq End 5
ld.local.u32 %r15, [%rd1];
-BB31_6:
+BB33_6:
mul.f64 %fd20, %fd64, %fd64;
mov.f64 %fd21, 0dBEF9757C5B27EBB1;
mov.f64 %fd22, 0d3EE48DAC2799BCB9;
@@ -4451,10 +4596,10 @@ BB31_6:
fma.rn.f64 %fd65, %fd7, %fd64, %fd64;
and.b32 %r14, %r15, 1;
setp.eq.b32 %p5, %r14, 1;
- @!%p5 bra BB31_8;
- bra.uni BB31_7;
+ @!%p5 bra BB33_8;
+ bra.uni BB33_7;
-BB31_7:
+BB33_7:
sub.f64 %fd52, %fd65, %fd64;
neg.f64 %fd53, %fd52;
fma.rn.f64 %fd54, %fd7, %fd64, %fd53;
@@ -4471,13 +4616,13 @@ BB31_7:
fma.rn.f64 %fd62, %fd60, %fd54, %fd61;
fma.rn.f64 %fd65, %fd62, %fd60, %fd60;
-BB31_8:
+BB33_8:
cvta.to.global.u64 %rd10, %rd4;
shl.b64 %rd11, %rd2, 3;
add.s64 %rd12, %rd10, %rd11;
st.global.f64 [%rd12], %fd65;
-BB31_9:
+BB33_9:
ret;
}
@@ -4503,7 +4648,7 @@ BB31_9:
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
- @%p1 bra BB32_5;
+ @%p1 bra BB34_5;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -4517,10 +4662,10 @@ BB31_9:
mov.b32 %f1, %r2;
abs.f32 %f2, %f1;
setp.lt.f32 %p2, %f2, 0f3FE26666;
- @%p2 bra BB32_3;
- bra.uni BB32_2;
+ @%p2 bra BB34_3;
+ bra.uni BB34_2;
-BB32_3:
+BB34_3:
mul.f64 %fd55, %fd1, %fd1;
mov.f64 %fd56, 0dBFB3823B180754AF;
mov.f64 %fd57, 0d3FB0066BDC1895E9;
@@ -4549,9 +4694,9 @@ BB32_3:
fma.rn.f64 %fd80, %fd78, %fd55, %fd79;
mul.f64 %fd81, %fd55, %fd80;
fma.rn.f64 %fd82, %fd81, %fd1, %fd1;
- bra.uni BB32_4;
+ bra.uni BB34_4;
-BB32_2:
+BB34_2:
abs.f64 %fd7, %fd1;
mov.f64 %fd8, 0d3FE0000000000000;
mov.f64 %fd9, 0dBFE0000000000000;
@@ -4631,13 +4776,13 @@ BB32_2:
or.b32 %r14, %r12, %r13;
mov.b64 %fd82, {%r11, %r14};
-BB32_4:
+BB34_4:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd82;
-BB32_5:
+BB34_5:
ret;
}
@@ -4662,7 +4807,7 @@ BB32_5:
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
- @%p1 bra BB33_14;
+ @%p1 bra BB35_14;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -4679,10 +4824,10 @@ BB32_5:
mov.b64 {%temp, %r8}, %fd1;
}
setp.lt.s32 %p2, %r8, 1071801958;
- @%p2 bra BB33_9;
- bra.uni BB33_2;
+ @%p2 bra BB35_9;
+ bra.uni BB35_2;
-BB33_9:
+BB35_9:
mul.f64 %fd62, %fd1, %fd1;
mov.f64 %fd63, 0dBFB3823B180754AF;
mov.f64 %fd64, 0d3FB0066BDC1895E9;
@@ -4712,14 +4857,14 @@ BB33_9:
mul.f64 %fd88, %fd62, %fd87;
fma.rn.f64 %fd10, %fd88, %fd1, %fd1;
setp.lt.s32 %p6, %r2, 0;
- @%p6 bra BB33_11;
+ @%p6 bra BB35_11;
mov.f64 %fd89, 0dBC91A62633145C07;
add.rn.f64 %fd90, %fd10, %fd89;
neg.f64 %fd93, %fd90;
- bra.uni BB33_12;
+ bra.uni BB35_12;
-BB33_2:
+BB35_2:
mov.f64 %fd19, 0d3FF0000000000000;
sub.f64 %fd2, %fd19, %fd1;
{
@@ -4755,7 +4900,7 @@ BB33_2:
fma.rn.f64 %fd28, %fd24, %fd25, %fd18;
fma.rn.f64 %fd3, %fd28, %fd27, %fd24;
setp.lt.s32 %p3, %r3, 1;
- @%p3 bra BB33_4;
+ @%p3 bra BB35_4;
{
.reg .b32 %temp;
@@ -4794,31 +4939,31 @@ BB33_2:
fma.rn.f64 %fd54, %fd52, %fd2, %fd53;
mul.f64 %fd55, %fd2, %fd54;
fma.rn.f64 %fd94, %fd55, %fd29, %fd29;
- bra.uni BB33_5;
+ bra.uni BB35_5;
-BB33_11:
+BB35_11:
mov.f64 %fd91, 0d3C91A62633145C07;
add.rn.f64 %fd93, %fd10, %fd91;
-BB33_12:
+BB35_12:
mov.f64 %fd92, 0d3FF921FB54442D18;
add.rn.f64 %fd94, %fd92, %fd93;
- bra.uni BB33_13;
+ bra.uni BB35_13;
-BB33_4:
+BB35_4:
mov.f64 %fd56, 0d0000000000000000;
mul.rn.f64 %fd94, %fd1, %fd56;
-BB33_5:
+BB35_5:
setp.gt.s32 %p4, %r3, -1;
- @%p4 bra BB33_7;
+ @%p4 bra BB35_7;
mov.f64 %fd57, 0d7FF0000000000000;
mul.rn.f64 %fd94, %fd94, %fd57;
-BB33_7:
+BB35_7:
setp.gt.s32 %p5, %r2, -1;
- @%p5 bra BB33_13;
+ @%p5 bra BB35_13;
mov.f64 %fd58, 0dBCA1A62633145C07;
add.rn.f64 %fd59, %fd94, %fd58;
@@ -4826,13 +4971,13 @@ BB33_7:
mov.f64 %fd61, 0d400921FB54442D18;
add.rn.f64 %fd94, %fd61, %fd60;
-BB33_13:
+BB35_13:
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd94;
-BB33_14:
+BB35_14:
ret;
}
@@ -4857,7 +5002,7 @@ BB33_14:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB34_4;
+ @%p1 bra BB36_4;
cvta.to.global.u64 %rd4, %rd2;
cvt.s64.s32 %rd1, %r1;
@@ -4867,7 +5012,7 @@ BB33_14:
abs.f64 %fd2, %fd1;
setp.leu.f64 %p2, %fd2, 0d3FF0000000000000;
mov.f64 %fd56, %fd2;
- @%p2 bra BB34_3;
+ @%p2 bra BB36_3;
// inline asm
rcp.approx.ftz.f64 %fd5,%fd2;
@@ -4881,7 +5026,7 @@ BB33_14:
selp.f64 %fd3, 0d0000000000000000, %fd11, %p3;
mov.f64 %fd56, %fd3;
-BB34_3:
+BB36_3:
mov.f64 %fd4, %fd56;
cvta.to.global.u64 %rd7, %rd3;
mul.f64 %fd12, %fd4, %fd4;
@@ -4947,7 +5092,7 @@ BB34_3:
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd55;
-BB34_4:
+BB36_4:
ret;
}
@@ -4972,7 +5117,7 @@ BB34_4:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
- @%p1 bra BB35_4;
+ @%p1 bra BB37_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
@@ -4981,15 +5126,15 @@ BB34_4:
setp.eq.f64 %p2, %fd1, 0d0000000000000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
- @%p2 bra BB35_3;
- bra.uni BB35_2;
+ @%p2 bra BB37_3;
+ bra.uni BB37_2;
-BB35_3:
+BB37_3:
mov.u64 %rd8, 0;
st.global.u64 [%rd1], %rd8;
- bra.uni BB35_4;
+ bra.uni BB37_4;
-BB35_2:
+BB37_2:
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
@@ -5009,7 +5154,7 @@ BB35_2:
mov.b64 %fd3, {%r11, %r10};
st.global.f64 [%rd1], %fd3;
-BB35_4:
+BB37_4:
ret;
}
@@ -5018,7 +5163,7 @@ BB35_4:
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
- .local .align 8 .b8 __local_depot36[40];
+ .local .align 8 .b8 __local_depot38[40];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
@@ -5027,7 +5172,7 @@ BB35_4:
.reg .b64 %rd<101>;
- mov.u64 %rd100, __local_depot36;
+ mov.u64 %rd100, __local_depot38;
cvta.local.u64 %SP, %rd100;
ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0];
ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1];
@@ -5041,7 +5186,7 @@ BB35_4:
shr.u32 %r3, %r1, 20;
bfe.u32 %r4, %r1, 20, 11;
setp.eq.s32 %p1, %r4, 2047;
- @%p1 bra BB36_13;
+ @%p1 bra BB38_13;
add.s32 %r16, %r4, -1024;
shr.u32 %r17, %r16, 6;
@@ -5054,7 +5199,7 @@ BB35_4:
setp.gt.s32 %p2, %r5, %r6;
mov.u64 %rd94, 0;
mov.u64 %rd93, %rd1;
- @%p2 bra BB36_4;
+ @%p2 bra BB38_4;
mov.b64 %rd41, %fd4;
shl.b64 %rd42, %rd41, 11;
@@ -5073,7 +5218,7 @@ BB35_4:
mov.u64 %rd91, %rd1;
mov.u32 %r39, %r7;
-BB36_3:
+BB38_3:
.pragma "nounroll";
mov.u32 %r8, %r39;
mov.u64 %rd7, %rd91;
@@ -5108,15 +5253,15 @@ BB36_3:
setp.lt.s32 %p3, %r9, %r6;
mov.u64 %rd91, %rd13;
mov.u32 %r39, %r9;
- @%p3 bra BB36_3;
+ @%p3 bra BB38_3;
-BB36_4:
+BB38_4:
st.local.u64 [%rd93], %rd94;
ld.local.u64 %rd95, [%rd1+16];
ld.local.u64 %rd96, [%rd1+24];
and.b32 %r10, %r3, 63;
setp.eq.s32 %p4, %r10, 0;
- @%p4 bra BB36_6;
+ @%p4 bra BB38_6;
mov.u32 %r27, 64;
sub.s32 %r28, %r27, %r10;
@@ -5128,7 +5273,7 @@ BB36_4:
shr.u64 %rd56, %rd55, %r28;
or.b64 %rd95, %rd56, %rd54;
-BB36_6:
+BB38_6:
cvta.to.local.u64 %rd57, %rd37;
shr.u64 %rd58, %rd96, 62;
cvt.u32.u64 %r29, %rd58;
@@ -5145,7 +5290,7 @@ BB36_6:
selp.b32 %r34, %r32, %r33, %p5;
st.local.u32 [%rd57], %r34;
setp.eq.s32 %p6, %r31, 0;
- @%p6 bra BB36_8;
+ @%p6 bra BB38_8;
mov.u64 %rd65, 0;
// inline asm
@@ -5165,10 +5310,10 @@ BB36_6:
// inline asm
xor.b32 %r40, %r40, -2147483648;
-BB36_8:
+BB38_8:
clz.b64 %r41, %rd98;
setp.eq.s32 %p7, %r41, 0;
- @%p7 bra BB36_10;
+ @%p7 bra BB38_10;
shl.b64 %rd68, %rd98, %r41;
mov.u32 %r35, 64;
@@ -5176,7 +5321,7 @@ BB36_8:
shr.u64 %rd69, %rd97, %r36;
or.b64 %rd98, %rd69, %rd68;
-BB36_10:
+BB38_10:
mov.u64 %rd73, -3958705157555305931;
// inline asm
{
@@ -5197,7 +5342,7 @@ BB36_10:
}
// inline asm
setp.lt.s64 %p8, %rd99, 1;
- @%p8 bra BB36_12;
+ @%p8 bra BB38_12;
// inline asm
{
@@ -5216,7 +5361,7 @@ BB36_10:
// inline asm
add.s32 %r41, %r41, 1;
-BB36_12:
+BB38_12:
cvt.u64.u32 %rd80, %r40;
shl.b64 %rd81, %rd80, 32;
mov.u32 %r37, 1022;
@@ -5231,7 +5376,7 @@ BB36_12:
or.b64 %rd89, %rd88, %rd81;
mov.b64 %fd4, %rd89;
-BB36_13:
+BB38_13:
st.param.f64 [func_retval0+0], %fd4;
ret;
}
@@ -5259,7 +5404,7 @@ BB36_13:
}
shr.u32 %r50, %r49, 20;
setp.ne.s32 %p1, %r50, 0;
- @%p1 bra BB37_2;
+ @%p1 bra BB39_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
@@ -5273,13 +5418,13 @@ BB36_13:
shr.u32 %r16, %r49, 20;
add.s32 %r50, %r16, -54;
-BB37_2:
+BB39_2:
add.s32 %r51, %r50, -1023;
and.b32 %r17, %r49, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd132, {%r48, %r18};
setp.lt.u32 %p2, %r18, 1073127583;
- @%p2 bra BB37_4;
+ @%p2 bra BB39_4;
{
.reg .b32 %temp;
@@ -5293,7 +5438,7 @@ BB37_2:
mov.b64 %fd132, {%r19, %r21};
add.s32 %r51, %r50, -1022;
-BB37_4:
+BB39_4:
add.f64 %fd16, %fd132, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd15,%fd16;
@@ -5458,13 +5603,13 @@ BB37_4:
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
- @%p4 bra BB37_7;
+ @%p4 bra BB39_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd133, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
- @%p6 bra BB37_7;
+ @%p6 bra BB39_7;
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
@@ -5479,26 +5624,26 @@ BB37_4:
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd133, %fd130, %fd131;
-BB37_7:
+BB39_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd133;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
- @%p7 bra BB37_9;
+ @%p7 bra BB39_9;
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd133;
}
setp.eq.s32 %p8, %r47, 0;
- @%p8 bra BB37_10;
+ @%p8 bra BB39_10;
-BB37_9:
+BB39_9:
fma.rn.f64 %fd133, %fd133, %fd5, %fd133;
-BB37_10:
+BB39_10:
st.param.f64 [func_retval0+0], %fd133;
ret;
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index 6175621..9155203 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -23,24 +23,26 @@ import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.rewrite.HopRewriteUtils;
import org.apache.sysml.lops.Aggregate;
+import org.apache.sysml.lops.Append;
+import org.apache.sysml.lops.AppendG;
import org.apache.sysml.lops.AppendGAlignedSP;
import org.apache.sysml.lops.AppendM;
-import org.apache.sysml.lops.AppendCP;
-import org.apache.sysml.lops.AppendG;
import org.apache.sysml.lops.AppendR;
import org.apache.sysml.lops.Binary;
-import org.apache.sysml.lops.BinaryScalar;
import org.apache.sysml.lops.BinaryM;
+import org.apache.sysml.lops.BinaryScalar;
import org.apache.sysml.lops.BinaryUAggChain;
import org.apache.sysml.lops.CentralMoment;
import org.apache.sysml.lops.CoVariance;
import org.apache.sysml.lops.CombineBinary;
+import org.apache.sysml.lops.CombineBinary.OperationTypes;
import org.apache.sysml.lops.CombineUnary;
import org.apache.sysml.lops.ConvolutionTransform;
import org.apache.sysml.lops.Data;
import org.apache.sysml.lops.DataPartition;
import org.apache.sysml.lops.Group;
import org.apache.sysml.lops.Lop;
+import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.lops.LopsException;
import org.apache.sysml.lops.PartialAggregate;
import org.apache.sysml.lops.PickByCount;
@@ -48,8 +50,6 @@ import org.apache.sysml.lops.RepMat;
import org.apache.sysml.lops.SortKeys;
import org.apache.sysml.lops.Unary;
import org.apache.sysml.lops.UnaryCP;
-import org.apache.sysml.lops.CombineBinary.OperationTypes;
-import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
@@ -527,15 +527,20 @@ public class BinaryOp extends Hop
}
else //CP
{
+ if (DMLScript.USE_ACCELERATOR && dt1 == DataType.MATRIX && (DMLScript.FORCE_ACCELERATOR
+ || getMemEstimate() < GPUContextPool.initialGPUMemBudget())) {
+ et = ExecType.GPU;
+ }
+
Lop offset = createOffsetLop( getInput().get(0), cbind ); //offset 1st input
- append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind);
+ append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind, et);
append.getOutputParameters().setDimensions(rlen, clen, getRowsInBlock(), getColsInBlock(), getNnz());
}
}
else //SCALAR-STRING and SCALAR-STRING (always CP)
{
- append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(),
- Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind);
+ append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(),
+ Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind, ExecType.CP);
append.getOutputParameters().setDimensions(0,0,-1,-1,-1);
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/Append.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Append.java b/src/main/java/org/apache/sysml/lops/Append.java
new file mode 100644
index 0000000..e224e51
--- /dev/null
+++ b/src/main/java/org/apache/sysml/lops/Append.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.lops;
+
+import org.apache.sysml.lops.LopProperties.ExecLocation;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.compile.JobType;
+import org.apache.sysml.parser.Expression.*;
+
+
+/**
+ * Lop for the append (cbind/rbind) operation.
+ * Replaces the former AppendCP lop: it additionally carries an explicit
+ * ExecType so the same lop type can be compiled for CP or GPU execution
+ * (the exec type is written into the generated instruction string).
+ */
+public class Append extends Lop
+{
+ // opcode shared by the CP and GPU append instruction parsers
+ public static final String OPCODE = "append";
+
+ private boolean _cbind = true; // true: cbind (append columns), false: rbind (append rows)
+ private ExecType _et; // target execution type, fixed at construction
+
+ /**
+ * @param input1 first (left/top) input lop
+ * @param input2 second (right/bottom) input lop
+ * @param input3 scalar offset lop (a literal -1 in the scalar-string case)
+ * @param dt result data type
+ * @param vt result value type
+ * @param cbind true for cbind, false for rbind
+ * @param et execution type (e.g. CP or GPU) for the generated instruction
+ */
+ public Append(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind, ExecType et)
+ {
+ super(Lop.Type.Append, dt, vt);
+ _et = et; // set before init(), which reads it in lps.setProperties
+ init(input1, input2, input3, dt, vt);
+
+ _cbind = cbind;
+ }
+
+ // Wires up the three inputs and the lop properties. dt/vt are unused here
+ // (they were already passed to the super constructor) but kept for
+ // signature parity with the former AppendCP.init.
+ public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt)
+ {
+ addInput(input1);
+ input1.addOutput(this);
+
+ addInput(input2);
+ input2.addOutput(this);
+
+ addInput(input3);
+ input3.addOutput(this);
+
+ boolean breaksAlignment = false;
+ boolean aligner = false;
+ boolean definesMRJob = false;
+
+ lps.addCompatibility(JobType.INVALID);
+ lps.setProperties( inputs, _et, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
+ }
+
+ @Override
+ public String toString() {
+
+ return " Append: ";
+ }
+
+ // Builds the runtime instruction string; invoked for control-program
+ // execution (CP or GPU, depending on the exec type set at construction).
+ // Layout: <execType>::append::in1::in2::offset::out::cbind
+ public String getInstructions(String input1, String input2, String input3, String output)
+ throws LopsException
+ {
+ StringBuilder sb = new StringBuilder();
+ sb.append( getExecType() );
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( "append" );
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( getInputs().get(0).prepInputOperand(input1));
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( getInputs().get(1).prepInputOperand(input2));
+
+ // note: uses the offset lop directly rather than the input3 argument
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( getInputs().get(2).prepScalarInputOperand(getExecType()));
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( prepOutputOperand(output) );
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( _cbind );
+
+ return sb.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/AppendCP.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/AppendCP.java b/src/main/java/org/apache/sysml/lops/AppendCP.java
deleted file mode 100644
index e76f21e..0000000
--- a/src/main/java/org/apache/sysml/lops/AppendCP.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.lops;
-
-import org.apache.sysml.lops.LopProperties.ExecLocation;
-import org.apache.sysml.lops.LopProperties.ExecType;
-import org.apache.sysml.lops.compile.JobType;
-import org.apache.sysml.parser.Expression.*;
-
-
-public class AppendCP extends Lop
-{
- public static final String OPCODE = "append";
-
- private boolean _cbind = true;
-
- public AppendCP(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind)
- {
- super(Lop.Type.Append, dt, vt);
- init(input1, input2, input3, dt, vt);
-
- _cbind = cbind;
- }
-
- public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt)
- {
- addInput(input1);
- input1.addOutput(this);
-
- addInput(input2);
- input2.addOutput(this);
-
- addInput(input3);
- input3.addOutput(this);
-
- boolean breaksAlignment = false;
- boolean aligner = false;
- boolean definesMRJob = false;
-
- lps.addCompatibility(JobType.INVALID);
- lps.setProperties( inputs, ExecType.CP, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
- }
-
- @Override
- public String toString() {
-
- return " AppendCP: ";
- }
-
- //called when append executes in CP
- public String getInstructions(String input1, String input2, String input3, String output)
- throws LopsException
- {
- StringBuilder sb = new StringBuilder();
- sb.append( getExecType() );
- sb.append( OPERAND_DELIMITOR );
- sb.append( "append" );
-
- sb.append( OPERAND_DELIMITOR );
- sb.append( getInputs().get(0).prepInputOperand(input1));
-
- sb.append( OPERAND_DELIMITOR );
- sb.append( getInputs().get(1).prepInputOperand(input2));
-
- sb.append( OPERAND_DELIMITOR );
- sb.append( getInputs().get(2).prepScalarInputOperand(getExecType()));
-
- sb.append( OPERAND_DELIMITOR );
- sb.append( prepOutputOperand(output) );
-
- sb.append( OPERAND_DELIMITOR );
- sb.append( _cbind );
-
- return sb.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 7088c50..e755fa0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -22,7 +22,7 @@ package org.apache.sysml.runtime.instructions;
import java.util.HashMap;
-import org.apache.sysml.lops.AppendCP;
+import org.apache.sysml.lops.Append;
import org.apache.sysml.lops.DataGen;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.lops.UnaryCP;
@@ -239,7 +239,7 @@ public class CPInstructionParser extends InstructionParser
// User-defined function Opcodes
String2CPInstructionType.put( "extfunct" , CPINSTRUCTION_TYPE.External);
- String2CPInstructionType.put( AppendCP.OPCODE, CPINSTRUCTION_TYPE.Append);
+ String2CPInstructionType.put( Append.OPCODE, CPINSTRUCTION_TYPE.Append);
// data generation opcodes
String2CPInstructionType.put( DataGen.RAND_OPCODE , CPINSTRUCTION_TYPE.Rand);
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 17b1578..36f57b4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -33,6 +33,7 @@ import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
import org.apache.sysml.runtime.instructions.gpu.RelationalBinaryGPUInstruction;
import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction;
import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.MatrixAppendGPUInstruction;
public class GPUInstructionParser extends InstructionParser
{
@@ -52,12 +53,15 @@ public class GPUInstructionParser extends InstructionParser
String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution);
// Matrix Multiply Operators
- String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
- String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
+ String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
+ String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
// Reorg/Transpose
- String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg);
-
+ String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg);
+
+ // Matrix Manipulation
+ String2GPUInstructionType.put( "append", GPUINSTRUCTION_TYPE.Append);
+
// Binary Cellwise
String2GPUInstructionType.put( "+", GPUINSTRUCTION_TYPE.ArithmeticBinary);
String2GPUInstructionType.put( "-", GPUINSTRUCTION_TYPE.ArithmeticBinary);
@@ -161,7 +165,10 @@ public class GPUInstructionParser extends InstructionParser
case BuiltinBinary:
return BuiltinBinaryGPUInstruction.parseInstruction(str);
-
+
+ case Append:
+ return MatrixAppendGPUInstruction.parseInstruction(str);
+
case Convolution:
return ConvolutionGPUInstruction.parseInstruction(str);
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
index 7529b05..e1c163d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
@@ -43,7 +43,7 @@ public abstract class BuiltinUnaryGPUInstruction extends GPUInstruction {
_gputype = GPUINSTRUCTION_TYPE.BuiltinUnary;
this._arity = _arity;
_input = in;
- _output = out;
+ _output = out;
}
public int getArity() {
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 7f981eb..a5388cb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -39,6 +39,7 @@ public abstract class GPUInstruction extends Instruction
Convolution,
MMTSJ,
Reorg,
+ Append,
ArithmeticBinary,
BuiltinUnary,
BuiltinBinary,
@@ -101,6 +102,8 @@ public abstract class GPUInstruction extends Instruction
public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel
public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel
public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel
+ public final static String MISC_TIMER_CBIND_KERNEL = "cbindk"; // time spent in the cbind kernel
+ public final static String MISC_TIMER_RBIND_KERNEL = "rbindk"; // time spent in the rbind kernel
public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel
public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
new file mode 100644
index 0000000..7671d7d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.OffsetColumnIndex;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.AppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.FrameAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.MatrixAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.ScalarAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysml.utils.GPUStatistics;
+
+/**
+ * Implements the cbind and rbind functions for matrices on the GPU.
+ * Parses the shared "append" opcode (see lops.Append) and dispatches to
+ * LibMatrixCUDA.cbind / LibMatrixCUDA.rbind.
+ */
+public class MatrixAppendGPUInstruction extends GPUInstruction {
+
+ CPOperand output;
+ CPOperand input1, input2; // the two matrices being appended
+ AppendCPInstruction.AppendType type; // CBIND or RBIND after parsing (see parseInstruction)
+
+ public MatrixAppendGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, AppendCPInstruction.AppendType type, String opcode, String istr) {
+ super(op, opcode, istr);
+ this.output = out;
+ this.input1 = in1;
+ this.input2 = in2;
+ this.type = type;
+ }
+
+ /**
+ * Parses an instruction of the form:
+ *   GPU::append::in1::in2::offset::out::cbind
+ * The offset operand (parts[3]) is parsed but not used on the GPU path.
+ *
+ * @param str the raw instruction string
+ * @return the parsed GPU append instruction
+ * @throws DMLRuntimeException if the opcode is wrong or an input is not a matrix
+ */
+ public static MatrixAppendGPUInstruction parseInstruction ( String str )
+ throws DMLRuntimeException
+ {
+ String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+ // 5 operands after the opcode, so parts has 6 entries and parts[5] is valid
+ InstructionUtils.checkNumFields (parts, 5);
+
+ String opcode = parts[0];
+ CPOperand in1 = new CPOperand(parts[1]);
+ CPOperand in2 = new CPOperand(parts[2]);
+ CPOperand in3 = new CPOperand(parts[3]); // offset operand; unused by the GPU kernels
+ CPOperand out = new CPOperand(parts[4]);
+ boolean cbind = Boolean.parseBoolean(parts[5]);
+
+ // STRING would only be chosen for non-matrix/non-frame inputs, which the
+ // matrix-only check below rejects — so type ends up CBIND or RBIND here.
+ AppendCPInstruction.AppendType type = (in1.getDataType()!= Expression.DataType.MATRIX && in1.getDataType()!= Expression.DataType.FRAME) ?
+ AppendCPInstruction.AppendType.STRING : cbind ? AppendCPInstruction.AppendType.CBIND : AppendCPInstruction.AppendType.RBIND;
+
+ if (in1.getDataType()!= Expression.DataType.MATRIX || in2.getDataType()!= Expression.DataType.MATRIX){
+ throw new DMLRuntimeException("GPU : Error in internal state - Append was called on data other than matrices");
+ }
+
+ if(!opcode.equalsIgnoreCase("append"))
+ // NOTE(review): message says "AppendCPInstruction" but this is the GPU parser — consider fixing the text
+ throw new DMLRuntimeException("Unknown opcode while parsing a AppendCPInstruction: " + str);
+
+ Operator op = new ReorgOperator(OffsetColumnIndex.getOffsetColumnIndexFnObject(-1));
+ return new MatrixAppendGPUInstruction(op, in1, in2, out, type, opcode, str);
+ }
+
+ /**
+ * Acquires both inputs on the GPU, runs cbind or rbind via LibMatrixCUDA,
+ * then releases inputs and output.
+ * NOTE(review): the releases are not in a finally block, so an exception in
+ * cbind/rbind leaves the inputs unreleased — verify whether that is intended.
+ */
+ @Override
+ public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
+ GPUStatistics.incrementNoOfExecutedGPUInst();
+
+ String opcode = getOpcode();
+ MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, input1.getName());
+ MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, input2.getName());
+
+ if(type == AppendCPInstruction.AppendType.CBIND) {
+ LibMatrixCUDA.cbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
+ } else if (type == AppendCPInstruction.AppendType.RBIND ) {
+ LibMatrixCUDA.rbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
+ } else {
+ // defensive: unreachable given the parse-time matrix check sets CBIND/RBIND
+ throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
+ }
+ ec.releaseMatrixInputForGPUInstruction(input1.getName());
+ ec.releaseMatrixInputForGPUInstruction(input2.getName());
+ ec.releaseMatrixOutputForGPUInstruction(output.getName());
+ }
+}