Posted to commits@systemml.apache.org by na...@apache.org on 2017/07/13 21:32:13 UTC

[2/2] systemml git commit: [SYSTEMML-1758] added cbind and rbind for GPU

[SYSTEMML-1758] added cbind and rbind for GPU

Closes #570


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4e47b5e1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4e47b5e1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4e47b5e1

Branch: refs/heads/master
Commit: 4e47b5e10ff1abdf1ef53c2b1b0d80614ec8e416
Parents: cd1ae5b
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu Jul 13 14:31:47 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu Jul 13 14:31:47 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   78 +-
 src/main/cpp/kernels/SystemML.ptx               | 1043 ++++++++++--------
 .../java/org/apache/sysml/hops/BinaryOp.java    |   21 +-
 src/main/java/org/apache/sysml/lops/Append.java |   95 ++
 .../java/org/apache/sysml/lops/AppendCP.java    |   93 --
 .../instructions/CPInstructionParser.java       |    4 +-
 .../instructions/GPUInstructionParser.java      |   17 +-
 .../gpu/BuiltinUnaryGPUInstruction.java         |    2 +-
 .../instructions/gpu/GPUInstruction.java        |    3 +
 .../gpu/MatrixAppendGPUInstruction.java         |  102 ++
 .../runtime/matrix/data/LibMatrixCUDA.java      |  109 +-
 .../org/apache/sysml/test/gpu/AppendTest.java   |  108 ++
 .../test/integration/gpu/ZPackageSuite.java     |    2 +
 13 files changed, 1099 insertions(+), 578 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 3098282..297269f 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -216,7 +216,7 @@ __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
 			bIndex = iy; // rlen == 1
 		C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
 		//printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
-    __syncthreads();
+	__syncthreads();
 	}
 }
 
@@ -238,9 +238,9 @@ __global__ void matrix_scalar_op(double* A, double scalar, double* C, int size,
 			C[index] = binaryOp(scalar, A[index], op);
 		} else {
 			C[index] = binaryOp(A[index], scalar, op);
-    }
+		}
 	}
-  __syncthreads();
+	__syncthreads();
 }
 
 
@@ -259,6 +259,78 @@ __global__ void fill(double* A, double scalar, int lenA) {
 }
 
 /**
+ * Appends Matrix B to the right side of Matrix A into a new matrix C
+ *         | 1 2 3 4 |   | 8 8 8 |     | 1 2 3 4 8 8 8 |
+ * cbind ( | 9 8 7 6 | , | 7 7 7 | ) = | 9 8 7 6 7 7 7 |
+ *         | 4 3 2 1 |   | 9 9 9 |     | 4 3 2 1 9 9 9 |
+ * @param A      input matrix A allocated on the GPU
+ * @param B      input matrix B allocated on the GPU
+ * @param C      output matrix C allocated on the GPU
+ * @param rowsA  rows in A
+ * @param colsA  columns in A
+ * @param rowsB  rows in B
+ * @param colsB  columns in B
+ */
+extern "C"
+__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+	int colsC = colsA + colsB;
+	int rowsC = rowsA;
+
+	// Copy an element of A into the appropriate location in C
+	if (ix < rowsA && iy < colsA) {
+		double elemA = A[ix * colsA + iy];
+		C[ix * colsC + iy] = elemA;
+	}
+
+	// Copy an element of B into the appropriate location in C
+	if (ix < rowsB && iy < colsB) {
+		double elemB = B[ix * colsB + iy];
+		C[ix * colsC + (iy + colsA)] = elemB;
+	}
+}
+
+
+/**
+ * Appends Matrix B to the bottom of Matrix A into a new matrix C
+ *         | 2 3 4 |   | 8 8 8 |     | 2 3 4 |
+ * rbind ( | 8 7 6 | , | 7 7 7 | ) = | 8 7 6 |
+ *         | 3 2 1 |                 | 3 2 1 |
+ *                                   | 8 8 8 |
+ *                                   | 7 7 7 |
+ * @param A      input matrix A allocated on the GPU
+ * @param B      input matrix B allocated on the GPU
+ * @param C      output matrix C allocated on the GPU
+ * @param rowsA  rows in A
+ * @param colsA  columns in A
+ * @param rowsB  rows in B
+ * @param colsB  columns in B
+ */
+extern "C"
+__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+	int rowsC = rowsA + rowsB;
+	int colsC = colsA;
+
+	// Copy an element of A into the appropriate location in C
+	if (ix < rowsA && iy < colsA) {
+		double elemA = A[ix * colsA + iy];
+		C[ix * colsC + iy] = elemA;
+	}
+
+	// Copy an element of B into the appropriate location in C
+	if (ix < rowsB && iy < colsB) {
+		double elemB = B[ix * colsB + iy];
+		C[(ix + rowsA) * colsC + iy] = elemB;
+	}
+}
+
+
+/**
  * Does a reduce operation over all elements of the array.
  * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
  * and the Reduction example available through jcuda.org

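For reference, a minimal host-side launch sketch for the new cbind kernel follows. It is an illustration only: the helper name, block size, and error handling are assumptions and are not part of this commit, which drives the kernels from the updated LibMatrixCUDA.java through JCuda. The kernel assigns the x dimension to rows and the y dimension to columns, and each thread copies at most one element of A and one element of B into C, so the grid only needs to cover max(rowsA, rowsB) by max(colsA, colsB) threads.

#include <cuda_runtime.h>
#include <cstdio>

// Forward declaration of the kernel added above in SystemML.cu; this sketch
// assumes it is compiled together with that file (or with -rdc=true and a
// device link step).
extern "C" __global__ void cbind(double *A, double *B, double *C,
                                 int rowsA, int colsA, int rowsB, int colsB);

// Hypothetical helper (not part of the commit): launches cbind over row-major
// double matrices dA, dB, dC that are already resident on the device.
void launchCbind(double *dA, double *dB, double *dC,
                 int rowsA, int colsA, int rowsB, int colsB) {
	int maxRows = rowsA > rowsB ? rowsA : rowsB;
	int maxCols = colsA > colsB ? colsA : colsB;
	dim3 block(16, 16);   // x indexes rows, y indexes columns
	dim3 grid((maxRows + block.x - 1) / block.x,
	          (maxCols + block.y - 1) / block.y);
	cbind<<<grid, block>>>(dA, dB, dC, rowsA, colsA, rowsB, colsB);
	cudaError_t err = cudaDeviceSynchronize();
	if (err != cudaSuccess)
		printf("cbind failed: %s\n", cudaGetErrorString(err));
}

The same grid sizing works for rbind, since its bounds checks are structured the same way.
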
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index ab43758..6884d5b 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1813,6 +1813,151 @@ BB9_2:
 	ret;
 }
 
+	// .globl	cbind
+.visible .entry cbind(
+	.param .u64 cbind_param_0,
+	.param .u64 cbind_param_1,
+	.param .u64 cbind_param_2,
+	.param .u32 cbind_param_3,
+	.param .u32 cbind_param_4,
+	.param .u32 cbind_param_5,
+	.param .u32 cbind_param_6
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<19>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<15>;
+
+
+	ld.param.u64 	%rd2, [cbind_param_0];
+	ld.param.u64 	%rd3, [cbind_param_1];
+	ld.param.u64 	%rd4, [cbind_param_2];
+	ld.param.u32 	%r7, [cbind_param_3];
+	ld.param.u32 	%r4, [cbind_param_4];
+	ld.param.u32 	%r5, [cbind_param_5];
+	ld.param.u32 	%r6, [cbind_param_6];
+	cvta.to.global.u64 	%rd1, %rd4;
+	mov.u32 	%r8, %ntid.x;
+	mov.u32 	%r9, %ctaid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r1, %r8, %r9, %r10;
+	mov.u32 	%r11, %ntid.y;
+	mov.u32 	%r12, %ctaid.y;
+	mov.u32 	%r13, %tid.y;
+	mad.lo.s32 	%r2, %r11, %r12, %r13;
+	add.s32 	%r3, %r6, %r4;
+	setp.lt.s32	%p1, %r1, %r7;
+	setp.lt.s32	%p2, %r2, %r4;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB10_2;
+	bra.uni 	BB10_1;
+
+BB10_1:
+	cvta.to.global.u64 	%rd5, %rd2;
+	mad.lo.s32 	%r14, %r1, %r4, %r2;
+	mul.wide.s32 	%rd6, %r14, 8;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f64 	%fd1, [%rd7];
+	mad.lo.s32 	%r15, %r1, %r3, %r2;
+	mul.wide.s32 	%rd8, %r15, 8;
+	add.s64 	%rd9, %rd1, %rd8;
+	st.global.f64 	[%rd9], %fd1;
+
+BB10_2:
+	setp.lt.s32	%p4, %r1, %r5;
+	setp.lt.s32	%p5, %r2, %r6;
+	and.pred  	%p6, %p4, %p5;
+	@!%p6 bra 	BB10_4;
+	bra.uni 	BB10_3;
+
+BB10_3:
+	cvta.to.global.u64 	%rd10, %rd3;
+	mad.lo.s32 	%r16, %r1, %r6, %r2;
+	mul.wide.s32 	%rd11, %r16, 8;
+	add.s64 	%rd12, %rd10, %rd11;
+	ld.global.f64 	%fd2, [%rd12];
+	mad.lo.s32 	%r17, %r1, %r3, %r4;
+	add.s32 	%r18, %r17, %r2;
+	mul.wide.s32 	%rd13, %r18, 8;
+	add.s64 	%rd14, %rd1, %rd13;
+	st.global.f64 	[%rd14], %fd2;
+
+BB10_4:
+	ret;
+}
+
+	// .globl	rbind
+.visible .entry rbind(
+	.param .u64 rbind_param_0,
+	.param .u64 rbind_param_1,
+	.param .u64 rbind_param_2,
+	.param .u32 rbind_param_3,
+	.param .u32 rbind_param_4,
+	.param .u32 rbind_param_5,
+	.param .u32 rbind_param_6
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .b32 	%r<17>;
+	.reg .f64 	%fd<3>;
+	.reg .b64 	%rd<14>;
+
+
+	ld.param.u64 	%rd2, [rbind_param_0];
+	ld.param.u64 	%rd3, [rbind_param_1];
+	ld.param.u64 	%rd4, [rbind_param_2];
+	ld.param.u32 	%r3, [rbind_param_3];
+	ld.param.u32 	%r4, [rbind_param_4];
+	ld.param.u32 	%r5, [rbind_param_5];
+	ld.param.u32 	%r6, [rbind_param_6];
+	cvta.to.global.u64 	%rd1, %rd4;
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	mov.u32 	%r10, %ntid.y;
+	mov.u32 	%r11, %ctaid.y;
+	mov.u32 	%r12, %tid.y;
+	mad.lo.s32 	%r2, %r10, %r11, %r12;
+	setp.lt.s32	%p1, %r1, %r3;
+	setp.lt.s32	%p2, %r2, %r4;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB11_2;
+	bra.uni 	BB11_1;
+
+BB11_1:
+	cvta.to.global.u64 	%rd5, %rd2;
+	mad.lo.s32 	%r13, %r1, %r4, %r2;
+	mul.wide.s32 	%rd6, %r13, 8;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f64 	%fd1, [%rd7];
+	add.s64 	%rd8, %rd1, %rd6;
+	st.global.f64 	[%rd8], %fd1;
+
+BB11_2:
+	setp.lt.s32	%p4, %r1, %r5;
+	setp.lt.s32	%p5, %r2, %r6;
+	and.pred  	%p6, %p4, %p5;
+	@!%p6 bra 	BB11_4;
+	bra.uni 	BB11_3;
+
+BB11_3:
+	cvta.to.global.u64 	%rd9, %rd3;
+	mad.lo.s32 	%r14, %r1, %r6, %r2;
+	mul.wide.s32 	%rd10, %r14, 8;
+	add.s64 	%rd11, %rd9, %rd10;
+	ld.global.f64 	%fd2, [%rd11];
+	add.s32 	%r15, %r1, %r3;
+	mad.lo.s32 	%r16, %r15, %r4, %r2;
+	mul.wide.s32 	%rd12, %r16, 8;
+	add.s64 	%rd13, %rd1, %rd12;
+	st.global.f64 	[%rd13], %fd2;
+
+BB11_4:
+	ret;
+}
+
 	// .globl	reduce_sum
 .visible .entry reduce_sum(
 	.param .u64 reduce_sum_param_0,
@@ -1837,9 +1982,9 @@ BB9_2:
 	mov.f64 	%fd76, 0d0000000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB10_4;
+	@%p1 bra 	BB12_4;
 
-BB10_1:
+BB12_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -1848,23 +1993,23 @@ BB10_1:
 	add.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB10_3;
+	@%p2 bra 	BB12_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	add.f64 	%fd78, %fd78, %fd31;
 
-BB10_3:
+BB12_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB10_1;
+	@%p3 bra 	BB12_1;
 
-BB10_4:
+BB12_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -1872,130 +2017,130 @@ BB10_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB10_8;
+	@%p4 bra 	BB12_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB10_7;
+	@%p5 bra 	BB12_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB10_7:
+BB12_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB10_8:
+BB12_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB10_12;
+	@%p6 bra 	BB12_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB10_11;
+	@%p7 bra 	BB12_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB10_11:
+BB12_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB10_12:
+BB12_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB10_16;
+	@%p8 bra 	BB12_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB10_15;
+	@%p9 bra 	BB12_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB10_15:
+BB12_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB10_16:
+BB12_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB10_20;
+	@%p10 bra 	BB12_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB10_19;
+	@%p11 bra 	BB12_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB10_19:
+BB12_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB10_20:
+BB12_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB10_33;
+	@%p12 bra 	BB12_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB10_23;
+	@%p13 bra 	BB12_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB10_23:
+BB12_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB10_25;
+	@%p14 bra 	BB12_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB10_25:
+BB12_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB10_27;
+	@%p15 bra 	BB12_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB10_27:
+BB12_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB10_29;
+	@%p16 bra 	BB12_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB10_29:
+BB12_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB10_31;
+	@%p17 bra 	BB12_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB10_31:
+BB12_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB10_33;
+	@%p18 bra 	BB12_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	add.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB10_33:
+BB12_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB10_35;
+	@%p19 bra 	BB12_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2003,7 +2148,7 @@ BB10_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB10_35:
+BB12_35:
 	ret;
 }
 
@@ -2027,17 +2172,17 @@ BB10_35:
 	ld.param.u32 	%r4, [reduce_row_sum_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB11_35;
+	@%p1 bra 	BB13_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d0000000000000000;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB11_4;
+	@%p2 bra 	BB13_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB11_3:
+BB13_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2047,9 +2192,9 @@ BB11_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB11_3;
+	@%p3 bra 	BB13_3;
 
-BB11_4:
+BB13_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2059,130 +2204,130 @@ BB11_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB11_8;
+	@%p4 bra 	BB13_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB11_7;
+	@%p5 bra 	BB13_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB11_7:
+BB13_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB11_8:
+BB13_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB11_12;
+	@%p6 bra 	BB13_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB11_11;
+	@%p7 bra 	BB13_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB11_11:
+BB13_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB11_12:
+BB13_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB11_16;
+	@%p8 bra 	BB13_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB11_15;
+	@%p9 bra 	BB13_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB11_15:
+BB13_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB11_16:
+BB13_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB11_20;
+	@%p10 bra 	BB13_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB11_19;
+	@%p11 bra 	BB13_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB11_19:
+BB13_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB11_20:
+BB13_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB11_33;
+	@%p12 bra 	BB13_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB11_23;
+	@%p13 bra 	BB13_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB11_23:
+BB13_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB11_25;
+	@%p14 bra 	BB13_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB11_25:
+BB13_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB11_27;
+	@%p15 bra 	BB13_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB11_27:
+BB13_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB11_29;
+	@%p16 bra 	BB13_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB11_29:
+BB13_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB11_31;
+	@%p17 bra 	BB13_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB11_31:
+BB13_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB11_33;
+	@%p18 bra 	BB13_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB11_33:
+BB13_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB11_35;
+	@%p19 bra 	BB13_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2190,7 +2335,7 @@ BB11_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB11_35:
+BB13_35:
 	ret;
 }
 
@@ -2217,18 +2362,18 @@ BB11_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB12_5;
+	@%p1 bra 	BB14_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB12_4;
+	@%p2 bra 	BB14_4;
 
 	mov.u32 	%r10, %r1;
 
-BB12_3:
+BB14_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2238,15 +2383,15 @@ BB12_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB12_3;
+	@%p3 bra 	BB14_3;
 
-BB12_4:
+BB14_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB12_5:
+BB14_5:
 	ret;
 }
 
@@ -2274,9 +2419,9 @@ BB12_5:
 	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB13_4;
+	@%p1 bra 	BB15_4;
 
-BB13_1:
+BB15_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2285,23 +2430,23 @@ BB13_1:
 	max.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB13_3;
+	@%p2 bra 	BB15_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	max.f64 	%fd78, %fd78, %fd31;
 
-BB13_3:
+BB15_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB13_1;
+	@%p3 bra 	BB15_1;
 
-BB13_4:
+BB15_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2309,130 +2454,130 @@ BB13_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB13_8;
+	@%p4 bra 	BB15_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB13_7;
+	@%p5 bra 	BB15_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB13_7:
+BB15_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB13_8:
+BB15_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB13_12;
+	@%p6 bra 	BB15_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB13_11;
+	@%p7 bra 	BB15_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB13_11:
+BB15_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB13_12:
+BB15_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB13_16;
+	@%p8 bra 	BB15_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB13_15;
+	@%p9 bra 	BB15_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB13_15:
+BB15_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB13_16:
+BB15_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB13_20;
+	@%p10 bra 	BB15_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB13_19;
+	@%p11 bra 	BB15_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB13_19:
+BB15_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB13_20:
+BB15_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB13_33;
+	@%p12 bra 	BB15_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB13_23;
+	@%p13 bra 	BB15_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB13_23:
+BB15_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB13_25;
+	@%p14 bra 	BB15_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB13_25:
+BB15_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB13_27;
+	@%p15 bra 	BB15_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB13_27:
+BB15_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB13_29;
+	@%p16 bra 	BB15_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB13_29:
+BB15_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB13_31;
+	@%p17 bra 	BB15_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB13_31:
+BB15_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB13_33;
+	@%p18 bra 	BB15_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	max.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB13_33:
+BB15_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB13_35;
+	@%p19 bra 	BB15_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2440,7 +2585,7 @@ BB13_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB13_35:
+BB15_35:
 	ret;
 }
 
@@ -2464,17 +2609,17 @@ BB13_35:
 	ld.param.u32 	%r4, [reduce_row_max_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB14_35;
+	@%p1 bra 	BB16_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB14_4;
+	@%p2 bra 	BB16_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB14_3:
+BB16_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2484,9 +2629,9 @@ BB14_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB14_3;
+	@%p3 bra 	BB16_3;
 
-BB14_4:
+BB16_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2496,130 +2641,130 @@ BB14_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB14_8;
+	@%p4 bra 	BB16_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB14_7;
+	@%p5 bra 	BB16_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	max.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB14_7:
+BB16_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB14_8:
+BB16_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB14_12;
+	@%p6 bra 	BB16_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB14_11;
+	@%p7 bra 	BB16_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	max.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB14_11:
+BB16_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB14_12:
+BB16_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB14_16;
+	@%p8 bra 	BB16_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB14_15;
+	@%p9 bra 	BB16_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	max.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB14_15:
+BB16_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB14_16:
+BB16_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB14_20;
+	@%p10 bra 	BB16_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB14_19;
+	@%p11 bra 	BB16_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	max.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB14_19:
+BB16_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB14_20:
+BB16_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB14_33;
+	@%p12 bra 	BB16_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB14_23;
+	@%p13 bra 	BB16_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	max.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB14_23:
+BB16_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB14_25;
+	@%p14 bra 	BB16_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	max.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB14_25:
+BB16_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB14_27;
+	@%p15 bra 	BB16_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	max.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB14_27:
+BB16_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB14_29;
+	@%p16 bra 	BB16_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	max.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB14_29:
+BB16_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB14_31;
+	@%p17 bra 	BB16_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	max.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB14_31:
+BB16_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB14_33;
+	@%p18 bra 	BB16_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	max.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB14_33:
+BB16_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB14_35;
+	@%p19 bra 	BB16_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -2627,7 +2772,7 @@ BB14_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB14_35:
+BB16_35:
 	ret;
 }
 
@@ -2654,18 +2799,18 @@ BB14_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB15_5;
+	@%p1 bra 	BB17_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB15_4;
+	@%p2 bra 	BB17_4;
 
 	mov.u32 	%r10, %r1;
 
-BB15_3:
+BB17_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -2675,15 +2820,15 @@ BB15_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB15_3;
+	@%p3 bra 	BB17_3;
 
-BB15_4:
+BB17_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB15_5:
+BB17_5:
 	ret;
 }
 
@@ -2711,9 +2856,9 @@ BB15_5:
 	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB16_4;
+	@%p1 bra 	BB18_4;
 
-BB16_1:
+BB18_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -2722,23 +2867,23 @@ BB16_1:
 	min.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB16_3;
+	@%p2 bra 	BB18_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	min.f64 	%fd78, %fd78, %fd31;
 
-BB16_3:
+BB18_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB16_1;
+	@%p3 bra 	BB18_1;
 
-BB16_4:
+BB18_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -2746,130 +2891,130 @@ BB16_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB16_8;
+	@%p4 bra 	BB18_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB16_7;
+	@%p5 bra 	BB18_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	min.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB16_7:
+BB18_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB16_8:
+BB18_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB16_12;
+	@%p6 bra 	BB18_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB16_11;
+	@%p7 bra 	BB18_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	min.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB16_11:
+BB18_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB16_12:
+BB18_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB16_16;
+	@%p8 bra 	BB18_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB16_15;
+	@%p9 bra 	BB18_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	min.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB16_15:
+BB18_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB16_16:
+BB18_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB16_20;
+	@%p10 bra 	BB18_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB16_19;
+	@%p11 bra 	BB18_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	min.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB16_19:
+BB18_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB16_20:
+BB18_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB16_33;
+	@%p12 bra 	BB18_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB16_23;
+	@%p13 bra 	BB18_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	min.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB16_23:
+BB18_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB16_25;
+	@%p14 bra 	BB18_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	min.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB16_25:
+BB18_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB16_27;
+	@%p15 bra 	BB18_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	min.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB16_27:
+BB18_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB16_29;
+	@%p16 bra 	BB18_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	min.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB16_29:
+BB18_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB16_31;
+	@%p17 bra 	BB18_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	min.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB16_31:
+BB18_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB16_33;
+	@%p18 bra 	BB18_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	min.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB16_33:
+BB18_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB16_35;
+	@%p19 bra 	BB18_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -2877,7 +3022,7 @@ BB16_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB16_35:
+BB18_35:
 	ret;
 }
 
@@ -2901,17 +3046,17 @@ BB16_35:
 	ld.param.u32 	%r4, [reduce_row_min_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB17_35;
+	@%p1 bra 	BB19_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB17_4;
+	@%p2 bra 	BB19_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB17_3:
+BB19_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2921,9 +3066,9 @@ BB17_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB17_3;
+	@%p3 bra 	BB19_3;
 
-BB17_4:
+BB19_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -2933,130 +3078,130 @@ BB17_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB17_8;
+	@%p4 bra 	BB19_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB17_7;
+	@%p5 bra 	BB19_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	min.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB17_7:
+BB19_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB17_8:
+BB19_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB17_12;
+	@%p6 bra 	BB19_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB17_11;
+	@%p7 bra 	BB19_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	min.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB17_11:
+BB19_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB17_12:
+BB19_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB17_16;
+	@%p8 bra 	BB19_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB17_15;
+	@%p9 bra 	BB19_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	min.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB17_15:
+BB19_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB17_16:
+BB19_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB17_20;
+	@%p10 bra 	BB19_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB17_19;
+	@%p11 bra 	BB19_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	min.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB17_19:
+BB19_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB17_20:
+BB19_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB17_33;
+	@%p12 bra 	BB19_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB17_23;
+	@%p13 bra 	BB19_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	min.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB17_23:
+BB19_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB17_25;
+	@%p14 bra 	BB19_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	min.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB17_25:
+BB19_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB17_27;
+	@%p15 bra 	BB19_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	min.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB17_27:
+BB19_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB17_29;
+	@%p16 bra 	BB19_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	min.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB17_29:
+BB19_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB17_31;
+	@%p17 bra 	BB19_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	min.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB17_31:
+BB19_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB17_33;
+	@%p18 bra 	BB19_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	min.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB17_33:
+BB19_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB17_35;
+	@%p19 bra 	BB19_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -3064,7 +3209,7 @@ BB17_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB17_35:
+BB19_35:
 	ret;
 }
 
@@ -3091,18 +3236,18 @@ BB17_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB18_5;
+	@%p1 bra 	BB20_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB18_4;
+	@%p2 bra 	BB20_4;
 
 	mov.u32 	%r10, %r1;
 
-BB18_3:
+BB20_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3112,15 +3257,15 @@ BB18_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB18_3;
+	@%p3 bra 	BB20_3;
 
-BB18_4:
+BB20_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB18_5:
+BB20_5:
 	ret;
 }
 
@@ -3148,9 +3293,9 @@ BB18_5:
 	mov.f64 	%fd76, 0d3FF0000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB19_4;
+	@%p1 bra 	BB21_4;
 
-BB19_1:
+BB21_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3159,23 +3304,23 @@ BB19_1:
 	mul.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB19_3;
+	@%p2 bra 	BB21_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	mul.f64 	%fd78, %fd78, %fd31;
 
-BB19_3:
+BB21_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB19_1;
+	@%p3 bra 	BB21_1;
 
-BB19_4:
+BB21_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3183,130 +3328,130 @@ BB19_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB19_8;
+	@%p4 bra 	BB21_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB19_7;
+	@%p5 bra 	BB21_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	mul.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB19_7:
+BB21_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB19_8:
+BB21_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB19_12;
+	@%p6 bra 	BB21_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB19_11;
+	@%p7 bra 	BB21_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	mul.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB19_11:
+BB21_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB19_12:
+BB21_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB19_16;
+	@%p8 bra 	BB21_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB19_15;
+	@%p9 bra 	BB21_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	mul.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB19_15:
+BB21_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB19_16:
+BB21_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB19_20;
+	@%p10 bra 	BB21_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB19_19;
+	@%p11 bra 	BB21_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	mul.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB19_19:
+BB21_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB19_20:
+BB21_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB19_33;
+	@%p12 bra 	BB21_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB19_23;
+	@%p13 bra 	BB21_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	mul.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB19_23:
+BB21_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB19_25;
+	@%p14 bra 	BB21_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	mul.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB19_25:
+BB21_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB19_27;
+	@%p15 bra 	BB21_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	mul.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB19_27:
+BB21_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB19_29;
+	@%p16 bra 	BB21_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	mul.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB19_29:
+BB21_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB19_31;
+	@%p17 bra 	BB21_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	mul.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB19_31:
+BB21_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB19_33;
+	@%p18 bra 	BB21_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	mul.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB19_33:
+BB21_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB19_35;
+	@%p19 bra 	BB21_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3314,7 +3459,7 @@ BB19_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB19_35:
+BB21_35:
 	ret;
 }
 
@@ -3338,17 +3483,17 @@ BB19_35:
 	ld.param.u32 	%r4, [reduce_row_mean_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB20_35;
+	@%p1 bra 	BB22_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd74, 0d0000000000000000;
 	mov.f64 	%fd75, %fd74;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB20_4;
+	@%p2 bra 	BB22_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB20_3:
+BB22_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3358,9 +3503,9 @@ BB20_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd74, %fd75;
-	@%p3 bra 	BB20_3;
+	@%p3 bra 	BB22_3;
 
-BB20_4:
+BB22_4:
 	mov.f64 	%fd72, %fd74;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3370,130 +3515,130 @@ BB20_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB20_8;
+	@%p4 bra 	BB22_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd73, %fd72;
-	@%p5 bra 	BB20_7;
+	@%p5 bra 	BB22_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd73, %fd72, %fd29;
 	st.shared.f64 	[%rd8], %fd73;
 
-BB20_7:
+BB22_7:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB20_8:
+BB22_8:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB20_12;
+	@%p6 bra 	BB22_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd71, %fd70;
-	@%p7 bra 	BB20_11;
+	@%p7 bra 	BB22_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd71, %fd70, %fd30;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB20_11:
+BB22_11:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB20_12:
+BB22_12:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB20_16;
+	@%p8 bra 	BB22_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd69, %fd68;
-	@%p9 bra 	BB20_15;
+	@%p9 bra 	BB22_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd69, %fd68, %fd31;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB20_15:
+BB22_15:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB20_16:
+BB22_16:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB20_20;
+	@%p10 bra 	BB22_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd67, %fd66;
-	@%p11 bra 	BB20_19;
+	@%p11 bra 	BB22_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd67, %fd66, %fd32;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB20_19:
+BB22_19:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB20_20:
+BB22_20:
 	mov.f64 	%fd65, %fd66;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB20_33;
+	@%p12 bra 	BB22_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB20_23;
+	@%p13 bra 	BB22_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd65, %fd65, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd65;
 
-BB20_23:
+BB22_23:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB20_25;
+	@%p14 bra 	BB22_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd64, %fd64, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd64;
 
-BB20_25:
+BB22_25:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB20_27;
+	@%p15 bra 	BB22_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd63, %fd63, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB20_27:
+BB22_27:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB20_29;
+	@%p16 bra 	BB22_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd62, %fd62, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB20_29:
+BB22_29:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB20_31;
+	@%p17 bra 	BB22_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd61, %fd61, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB20_31:
+BB22_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB20_33;
+	@%p18 bra 	BB22_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd61, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB20_33:
+BB22_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB20_35;
+	@%p19 bra 	BB22_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvt.u64.u32	%rd39, %r4;
@@ -3504,7 +3649,7 @@ BB20_33:
 	add.s64 	%rd42, %rd40, %rd41;
 	st.global.f64 	[%rd42], %fd42;
 
-BB20_35:
+BB22_35:
 	ret;
 }
 
@@ -3531,18 +3676,18 @@ BB20_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB21_5;
+	@%p1 bra 	BB23_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd10, 0d0000000000000000;
 	mov.f64 	%fd11, %fd10;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB21_4;
+	@%p2 bra 	BB23_4;
 
 	mov.u32 	%r10, %r1;
 
-BB21_3:
+BB23_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3552,9 +3697,9 @@ BB21_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd10, %fd11;
-	@%p3 bra 	BB21_3;
+	@%p3 bra 	BB23_3;
 
-BB21_4:
+BB23_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.u64.u32	%rd7, %r5;
 	cvt.rn.f64.s64	%fd7, %rd7;
@@ -3563,7 +3708,7 @@ BB21_4:
 	add.s64 	%rd9, %rd6, %rd8;
 	st.global.f64 	[%rd9], %fd8;
 
-BB21_5:
+BB23_5:
 	ret;
 }
 
@@ -3589,7 +3734,7 @@ BB21_5:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r6, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB22_5;
+	@%p1 bra 	BB24_5;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -3649,13 +3794,13 @@ BB21_5:
 	mov.b32 	 %f2, %r11;
 	abs.f32 	%f1, %f2;
 	setp.lt.f32	%p2, %f1, 0f4086232B;
-	@%p2 bra 	BB22_4;
+	@%p2 bra 	BB24_4;
 
 	setp.lt.f64	%p3, %fd1, 0d0000000000000000;
 	add.f64 	%fd37, %fd1, 0d7FF0000000000000;
 	selp.f64	%fd40, 0d0000000000000000, %fd37, %p3;
 	setp.geu.f32	%p4, %f1, 0f40874800;
-	@%p4 bra 	BB22_4;
+	@%p4 bra 	BB24_4;
 
 	shr.u32 	%r12, %r2, 31;
 	add.s32 	%r13, %r2, %r12;
@@ -3670,13 +3815,13 @@ BB21_5:
 	mov.b64 	%fd39, {%r20, %r19};
 	mul.f64 	%fd40, %fd38, %fd39;
 
-BB22_4:
+BB24_4:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd40;
 
-BB22_5:
+BB24_5:
 	ret;
 }
 
@@ -3701,7 +3846,7 @@ BB22_5:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB23_2;
+	@%p1 bra 	BB25_2;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
@@ -3712,7 +3857,7 @@ BB22_5:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd2;
 
-BB23_2:
+BB25_2:
 	ret;
 }
 
@@ -3737,7 +3882,7 @@ BB23_2:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB24_4;
+	@%p1 bra 	BB26_4;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -3746,7 +3891,7 @@ BB23_2:
 	ld.global.f64 	%fd9, [%rd6];
 	abs.f64 	%fd2, %fd9;
 	setp.ge.f64	%p2, %fd2, 0d4330000000000000;
-	@%p2 bra 	BB24_3;
+	@%p2 bra 	BB26_3;
 
 	add.f64 	%fd5, %fd2, 0d3FE0000000000000;
 	cvt.rzi.f64.f64	%fd6, %fd5;
@@ -3768,7 +3913,7 @@ BB23_2:
 	or.b32  	%r10, %r7, %r9;
 	mov.b64 	%fd9, {%r6, %r10};
 
-BB24_3:
+BB26_3:
 	cvta.to.global.u64 	%rd7, %rd3;
 	cvt.rzi.s64.f64	%rd8, %fd9;
 	cvt.rn.f64.s64	%fd8, %rd8;
@@ -3776,7 +3921,7 @@ BB24_3:
 	add.s64 	%rd10, %rd7, %rd9;
 	st.global.f64 	[%rd10], %fd8;
 
-BB24_4:
+BB26_4:
 	ret;
 }
 
@@ -3801,7 +3946,7 @@ BB24_4:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB25_2;
+	@%p1 bra 	BB27_2;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
@@ -3812,7 +3957,7 @@ BB24_4:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd2;
 
-BB25_2:
+BB27_2:
 	ret;
 }
 
@@ -3838,7 +3983,7 @@ BB25_2:
 	mov.u32 	%r15, %tid.x;
 	mad.lo.s32 	%r1, %r14, %r13, %r15;
 	setp.ge.u32	%p1, %r1, %r12;
-	@%p1 bra 	BB26_9;
+	@%p1 bra 	BB28_9;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -3855,7 +4000,7 @@ BB25_2:
 	}
 	mov.u32 	%r31, -1023;
 	setp.gt.s32	%p2, %r29, 1048575;
-	@%p2 bra 	BB26_3;
+	@%p2 bra 	BB28_3;
 
 	mul.f64 	%fd56, %fd56, 0d4350000000000000;
 	{
@@ -3868,20 +4013,20 @@ BB25_2:
 	}
 	mov.u32 	%r31, -1077;
 
-BB26_3:
+BB28_3:
 	add.s32 	%r18, %r29, -1;
 	setp.lt.u32	%p3, %r18, 2146435071;
-	@%p3 bra 	BB26_5;
-	bra.uni 	BB26_4;
+	@%p3 bra 	BB28_5;
+	bra.uni 	BB28_4;
 
-BB26_5:
+BB28_5:
 	shr.u32 	%r20, %r29, 20;
 	add.s32 	%r32, %r31, %r20;
 	and.b32  	%r21, %r29, -2146435073;
 	or.b32  	%r22, %r21, 1072693248;
 	mov.b64 	%fd57, {%r30, %r22};
 	setp.lt.s32	%p5, %r22, 1073127583;
-	@%p5 bra 	BB26_7;
+	@%p5 bra 	BB28_7;
 
 	{
 	.reg .b32 %temp; 
@@ -3895,7 +4040,7 @@ BB26_5:
 	mov.b64 	%fd57, {%r23, %r25};
 	add.s32 	%r32, %r32, 1;
 
-BB26_7:
+BB28_7:
 	add.f64 	%fd13, %fd57, 0d3FF0000000000000;
 	// inline asm
 	rcp.approx.ftz.f64 %fd12,%fd13;
@@ -3946,9 +4091,9 @@ BB26_7:
 	mov.f64 	%fd54, 0d3C7ABC9E3B39803F;
 	fma.rn.f64 	%fd55, %fd47, %fd54, %fd53;
 	add.f64 	%fd58, %fd49, %fd55;
-	bra.uni 	BB26_8;
+	bra.uni 	BB28_8;
 
-BB26_4:
+BB28_4:
 	mov.f64 	%fd10, 0d7FF0000000000000;
 	fma.rn.f64 	%fd11, %fd56, %fd10, %fd10;
 	{
@@ -3959,13 +4104,13 @@ BB26_4:
 	setp.eq.f32	%p4, %f1, 0f00000000;
 	selp.f64	%fd58, 0dFFF0000000000000, %fd11, %p4;
 
-BB26_8:
+BB28_8:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd58;
 
-BB26_9:
+BB28_9:
 	ret;
 }
 
@@ -3990,7 +4135,7 @@ BB26_9:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB27_2;
+	@%p1 bra 	BB29_2;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
@@ -4001,7 +4146,7 @@ BB26_9:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd2;
 
-BB27_2:
+BB29_2:
 	ret;
 }
 
@@ -4026,7 +4171,7 @@ BB27_2:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB28_2;
+	@%p1 bra 	BB30_2;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
@@ -4037,7 +4182,7 @@ BB27_2:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd2;
 
-BB28_2:
+BB30_2:
 	ret;
 }
 
@@ -4048,7 +4193,7 @@ BB28_2:
 	.param .u32 matrix_sin_param_2
 )
 {
-	.local .align 4 .b8 	__local_depot29[4];
+	.local .align 4 .b8 	__local_depot31[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
 	.reg .pred 	%p<7>;
@@ -4057,7 +4202,7 @@ BB28_2:
 	.reg .b64 	%rd<17>;
 
 
-	mov.u64 	%rd16, __local_depot29;
+	mov.u64 	%rd16, __local_depot31;
 	cvta.local.u64 	%SP, %rd16;
 	ld.param.u64 	%rd3, [matrix_sin_param_0];
 	ld.param.u64 	%rd4, [matrix_sin_param_1];
@@ -4069,7 +4214,7 @@ BB28_2:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r7, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB29_11;
+	@%p1 bra 	BB31_11;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4082,19 +4227,19 @@ BB28_2:
 	}
 	and.b32  	%r10, %r9, 2147483647;
 	setp.ne.s32	%p2, %r10, 2146435072;
-	@%p2 bra 	BB29_4;
+	@%p2 bra 	BB31_4;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r11, %temp}, %fd38;
 	}
 	setp.ne.s32	%p3, %r11, 0;
-	@%p3 bra 	BB29_4;
+	@%p3 bra 	BB31_4;
 
 	mov.f64 	%fd14, 0d0000000000000000;
 	mul.rn.f64 	%fd38, %fd38, %fd14;
 
-BB29_4:
+BB31_4:
 	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r17, %fd15;
 	st.local.u32 	[%rd1], %r17;
@@ -4112,7 +4257,7 @@ BB29_4:
 	}
 	and.b32  	%r13, %r12, 2145386496;
 	setp.lt.u32	%p4, %r13, 1105199104;
-	@%p4 bra 	BB29_6;
+	@%p4 bra 	BB31_6;
 
 	// Callseq Start 3
 	{
@@ -4135,7 +4280,7 @@ BB29_4:
 	}// Callseq End 3
 	ld.local.u32 	%r17, [%rd1];
 
-BB29_6:
+BB31_6:
 	and.b32  	%r14, %r17, 1;
 	shl.b32 	%r15, %r14, 3;
 	setp.eq.s32	%p5, %r14, 0;
@@ -4157,27 +4302,27 @@ BB29_6:
 	ld.const.f64 	%fd34, [%rd12+48];
 	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
 	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
-	@%p5 bra 	BB29_8;
+	@%p5 bra 	BB31_8;
 
 	mov.f64 	%fd35, 0d3FF0000000000000;
 	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
 
-BB29_8:
+BB31_8:
 	and.b32  	%r16, %r17, 2;
 	setp.eq.s32	%p6, %r16, 0;
-	@%p6 bra 	BB29_10;
+	@%p6 bra 	BB31_10;
 
 	mov.f64 	%fd36, 0d0000000000000000;
 	mov.f64 	%fd37, 0dBFF0000000000000;
 	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
 
-BB29_10:
+BB31_10:
 	cvta.to.global.u64 	%rd13, %rd4;
 	shl.b64 	%rd14, %rd2, 3;
 	add.s64 	%rd15, %rd13, %rd14;
 	st.global.f64 	[%rd15], %fd40;
 
-BB29_11:
+BB31_11:
 	ret;
 }
 
@@ -4188,7 +4333,7 @@ BB29_11:
 	.param .u32 matrix_cos_param_2
 )
 {
-	.local .align 4 .b8 	__local_depot30[4];
+	.local .align 4 .b8 	__local_depot32[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
 	.reg .pred 	%p<7>;
@@ -4197,7 +4342,7 @@ BB29_11:
 	.reg .b64 	%rd<17>;
 
 
-	mov.u64 	%rd16, __local_depot30;
+	mov.u64 	%rd16, __local_depot32;
 	cvta.local.u64 	%SP, %rd16;
 	ld.param.u64 	%rd3, [matrix_cos_param_0];
 	ld.param.u64 	%rd4, [matrix_cos_param_1];
@@ -4209,7 +4354,7 @@ BB29_11:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB30_11;
+	@%p1 bra 	BB32_11;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4222,19 +4367,19 @@ BB29_11:
 	}
 	and.b32  	%r11, %r10, 2147483647;
 	setp.ne.s32	%p2, %r11, 2146435072;
-	@%p2 bra 	BB30_4;
+	@%p2 bra 	BB32_4;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r12, %temp}, %fd38;
 	}
 	setp.ne.s32	%p3, %r12, 0;
-	@%p3 bra 	BB30_4;
+	@%p3 bra 	BB32_4;
 
 	mov.f64 	%fd14, 0d0000000000000000;
 	mul.rn.f64 	%fd38, %fd38, %fd14;
 
-BB30_4:
+BB32_4:
 	mul.f64 	%fd15, %fd38, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r18, %fd15;
 	st.local.u32 	[%rd1], %r18;
@@ -4252,7 +4397,7 @@ BB30_4:
 	}
 	and.b32  	%r14, %r13, 2145386496;
 	setp.lt.u32	%p4, %r14, 1105199104;
-	@%p4 bra 	BB30_6;
+	@%p4 bra 	BB32_6;
 
 	// Callseq Start 4
 	{
@@ -4275,7 +4420,7 @@ BB30_4:
 	}// Callseq End 4
 	ld.local.u32 	%r18, [%rd1];
 
-BB30_6:
+BB32_6:
 	add.s32 	%r5, %r18, 1;
 	and.b32  	%r15, %r5, 1;
 	shl.b32 	%r16, %r15, 3;
@@ -4298,27 +4443,27 @@ BB30_6:
 	ld.const.f64 	%fd34, [%rd12+48];
 	fma.rn.f64 	%fd8, %fd33, %fd7, %fd34;
 	fma.rn.f64 	%fd40, %fd8, %fd39, %fd39;
-	@%p5 bra 	BB30_8;
+	@%p5 bra 	BB32_8;
 
 	mov.f64 	%fd35, 0d3FF0000000000000;
 	fma.rn.f64 	%fd40, %fd8, %fd7, %fd35;
 
-BB30_8:
+BB32_8:
 	and.b32  	%r17, %r5, 2;
 	setp.eq.s32	%p6, %r17, 0;
-	@%p6 bra 	BB30_10;
+	@%p6 bra 	BB32_10;
 
 	mov.f64 	%fd36, 0d0000000000000000;
 	mov.f64 	%fd37, 0dBFF0000000000000;
 	fma.rn.f64 	%fd40, %fd40, %fd37, %fd36;
 
-BB30_10:
+BB32_10:
 	cvta.to.global.u64 	%rd13, %rd4;
 	shl.b64 	%rd14, %rd2, 3;
 	add.s64 	%rd15, %rd13, %rd14;
 	st.global.f64 	[%rd15], %fd40;
 
-BB30_11:
+BB32_11:
 	ret;
 }
 
@@ -4329,7 +4474,7 @@ BB30_11:
 	.param .u32 matrix_tan_param_2
 )
 {
-	.local .align 4 .b8 	__local_depot31[4];
+	.local .align 4 .b8 	__local_depot33[4];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
 	.reg .pred 	%p<6>;
@@ -4338,7 +4483,7 @@ BB30_11:
 	.reg .b64 	%rd<14>;
 
 
-	mov.u64 	%rd13, __local_depot31;
+	mov.u64 	%rd13, __local_depot33;
 	cvta.local.u64 	%SP, %rd13;
 	ld.param.u64 	%rd3, [matrix_tan_param_0];
 	ld.param.u64 	%rd4, [matrix_tan_param_1];
@@ -4350,7 +4495,7 @@ BB30_11:
 	mov.u32 	%r8, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r7, %r8;
 	setp.ge.u32	%p1, %r1, %r5;
-	@%p1 bra 	BB31_9;
+	@%p1 bra 	BB33_9;
 
 	cvta.to.global.u64 	%rd6, %rd3;
 	cvt.s64.s32	%rd2, %r1;
@@ -4363,19 +4508,19 @@ BB30_11:
 	}
 	and.b32  	%r10, %r9, 2147483647;
 	setp.ne.s32	%p2, %r10, 2146435072;
-	@%p2 bra 	BB31_4;
+	@%p2 bra 	BB33_4;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r11, %temp}, %fd63;
 	}
 	setp.ne.s32	%p3, %r11, 0;
-	@%p3 bra 	BB31_4;
+	@%p3 bra 	BB33_4;
 
 	mov.f64 	%fd11, 0d0000000000000000;
 	mul.rn.f64 	%fd63, %fd63, %fd11;
 
-BB31_4:
+BB33_4:
 	mul.f64 	%fd12, %fd63, 0d3FE45F306DC9C883;
 	cvt.rni.s32.f64	%r15, %fd12;
 	st.local.u32 	[%rd1], %r15;
@@ -4393,7 +4538,7 @@ BB31_4:
 	}
 	and.b32  	%r13, %r12, 2145386496;
 	setp.lt.u32	%p4, %r13, 1105199104;
-	@%p4 bra 	BB31_6;
+	@%p4 bra 	BB33_6;
 
 	// Callseq Start 5
 	{
@@ -4416,7 +4561,7 @@ BB31_4:
 	}// Callseq End 5
 	ld.local.u32 	%r15, [%rd1];
 
-BB31_6:
+BB33_6:
 	mul.f64 	%fd20, %fd64, %fd64;
 	mov.f64 	%fd21, 0dBEF9757C5B27EBB1;
 	mov.f64 	%fd22, 0d3EE48DAC2799BCB9;
@@ -4451,10 +4596,10 @@ BB31_6:
 	fma.rn.f64 	%fd65, %fd7, %fd64, %fd64;
 	and.b32  	%r14, %r15, 1;
 	setp.eq.b32	%p5, %r14, 1;
-	@!%p5 bra 	BB31_8;
-	bra.uni 	BB31_7;
+	@!%p5 bra 	BB33_8;
+	bra.uni 	BB33_7;
 
-BB31_7:
+BB33_7:
 	sub.f64 	%fd52, %fd65, %fd64;
 	neg.f64 	%fd53, %fd52;
 	fma.rn.f64 	%fd54, %fd7, %fd64, %fd53;
@@ -4471,13 +4616,13 @@ BB31_7:
 	fma.rn.f64 	%fd62, %fd60, %fd54, %fd61;
 	fma.rn.f64 	%fd65, %fd62, %fd60, %fd60;
 
-BB31_8:
+BB33_8:
 	cvta.to.global.u64 	%rd10, %rd4;
 	shl.b64 	%rd11, %rd2, 3;
 	add.s64 	%rd12, %rd10, %rd11;
 	st.global.f64 	[%rd12], %fd65;
 
-BB31_9:
+BB33_9:
 	ret;
 }
 
@@ -4503,7 +4648,7 @@ BB31_9:
 	mov.u32 	%r6, %tid.x;
 	mad.lo.s32 	%r1, %r5, %r4, %r6;
 	setp.ge.u32	%p1, %r1, %r3;
-	@%p1 bra 	BB32_5;
+	@%p1 bra 	BB34_5;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -4517,10 +4662,10 @@ BB31_9:
 	mov.b32 	 %f1, %r2;
 	abs.f32 	%f2, %f1;
 	setp.lt.f32	%p2, %f2, 0f3FE26666;
-	@%p2 bra 	BB32_3;
-	bra.uni 	BB32_2;
+	@%p2 bra 	BB34_3;
+	bra.uni 	BB34_2;
 
-BB32_3:
+BB34_3:
 	mul.f64 	%fd55, %fd1, %fd1;
 	mov.f64 	%fd56, 0dBFB3823B180754AF;
 	mov.f64 	%fd57, 0d3FB0066BDC1895E9;
@@ -4549,9 +4694,9 @@ BB32_3:
 	fma.rn.f64 	%fd80, %fd78, %fd55, %fd79;
 	mul.f64 	%fd81, %fd55, %fd80;
 	fma.rn.f64 	%fd82, %fd81, %fd1, %fd1;
-	bra.uni 	BB32_4;
+	bra.uni 	BB34_4;
 
-BB32_2:
+BB34_2:
 	abs.f64 	%fd7, %fd1;
 	mov.f64 	%fd8, 0d3FE0000000000000;
 	mov.f64 	%fd9, 0dBFE0000000000000;
@@ -4631,13 +4776,13 @@ BB32_2:
 	or.b32  	%r14, %r12, %r13;
 	mov.b64 	%fd82, {%r11, %r14};
 
-BB32_4:
+BB34_4:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd82;
 
-BB32_5:
+BB34_5:
 	ret;
 }
 
@@ -4662,7 +4807,7 @@ BB32_5:
 	mov.u32 	%r7, %tid.x;
 	mad.lo.s32 	%r1, %r6, %r5, %r7;
 	setp.ge.u32	%p1, %r1, %r4;
-	@%p1 bra 	BB33_14;
+	@%p1 bra 	BB35_14;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -4679,10 +4824,10 @@ BB32_5:
 	mov.b64 	{%temp, %r8}, %fd1;
 	}
 	setp.lt.s32	%p2, %r8, 1071801958;
-	@%p2 bra 	BB33_9;
-	bra.uni 	BB33_2;
+	@%p2 bra 	BB35_9;
+	bra.uni 	BB35_2;
 
-BB33_9:
+BB35_9:
 	mul.f64 	%fd62, %fd1, %fd1;
 	mov.f64 	%fd63, 0dBFB3823B180754AF;
 	mov.f64 	%fd64, 0d3FB0066BDC1895E9;
@@ -4712,14 +4857,14 @@ BB33_9:
 	mul.f64 	%fd88, %fd62, %fd87;
 	fma.rn.f64 	%fd10, %fd88, %fd1, %fd1;
 	setp.lt.s32	%p6, %r2, 0;
-	@%p6 bra 	BB33_11;
+	@%p6 bra 	BB35_11;
 
 	mov.f64 	%fd89, 0dBC91A62633145C07;
 	add.rn.f64 	%fd90, %fd10, %fd89;
 	neg.f64 	%fd93, %fd90;
-	bra.uni 	BB33_12;
+	bra.uni 	BB35_12;
 
-BB33_2:
+BB35_2:
 	mov.f64 	%fd19, 0d3FF0000000000000;
 	sub.f64 	%fd2, %fd19, %fd1;
 	{
@@ -4755,7 +4900,7 @@ BB33_2:
 	fma.rn.f64 	%fd28, %fd24, %fd25, %fd18;
 	fma.rn.f64 	%fd3, %fd28, %fd27, %fd24;
 	setp.lt.s32	%p3, %r3, 1;
-	@%p3 bra 	BB33_4;
+	@%p3 bra 	BB35_4;
 
 	{
 	.reg .b32 %temp; 
@@ -4794,31 +4939,31 @@ BB33_2:
 	fma.rn.f64 	%fd54, %fd52, %fd2, %fd53;
 	mul.f64 	%fd55, %fd2, %fd54;
 	fma.rn.f64 	%fd94, %fd55, %fd29, %fd29;
-	bra.uni 	BB33_5;
+	bra.uni 	BB35_5;
 
-BB33_11:
+BB35_11:
 	mov.f64 	%fd91, 0d3C91A62633145C07;
 	add.rn.f64 	%fd93, %fd10, %fd91;
 
-BB33_12:
+BB35_12:
 	mov.f64 	%fd92, 0d3FF921FB54442D18;
 	add.rn.f64 	%fd94, %fd92, %fd93;
-	bra.uni 	BB33_13;
+	bra.uni 	BB35_13;
 
-BB33_4:
+BB35_4:
 	mov.f64 	%fd56, 0d0000000000000000;
 	mul.rn.f64 	%fd94, %fd1, %fd56;
 
-BB33_5:
+BB35_5:
 	setp.gt.s32	%p4, %r3, -1;
-	@%p4 bra 	BB33_7;
+	@%p4 bra 	BB35_7;
 
 	mov.f64 	%fd57, 0d7FF0000000000000;
 	mul.rn.f64 	%fd94, %fd94, %fd57;
 
-BB33_7:
+BB35_7:
 	setp.gt.s32	%p5, %r2, -1;
-	@%p5 bra 	BB33_13;
+	@%p5 bra 	BB35_13;
 
 	mov.f64 	%fd58, 0dBCA1A62633145C07;
 	add.rn.f64 	%fd59, %fd94, %fd58;
@@ -4826,13 +4971,13 @@ BB33_7:
 	mov.f64 	%fd61, 0d400921FB54442D18;
 	add.rn.f64 	%fd94, %fd61, %fd60;
 
-BB33_13:
+BB35_13:
 	cvta.to.global.u64 	%rd7, %rd3;
 	shl.b64 	%rd8, %rd1, 3;
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd94;
 
-BB33_14:
+BB35_14:
 	ret;
 }
 
@@ -4857,7 +5002,7 @@ BB33_14:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB34_4;
+	@%p1 bra 	BB36_4;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	cvt.s64.s32	%rd1, %r1;
@@ -4867,7 +5012,7 @@ BB33_14:
 	abs.f64 	%fd2, %fd1;
 	setp.leu.f64	%p2, %fd2, 0d3FF0000000000000;
 	mov.f64 	%fd56, %fd2;
-	@%p2 bra 	BB34_3;
+	@%p2 bra 	BB36_3;
 
 	// inline asm
 	rcp.approx.ftz.f64 %fd5,%fd2;
@@ -4881,7 +5026,7 @@ BB33_14:
 	selp.f64	%fd3, 0d0000000000000000, %fd11, %p3;
 	mov.f64 	%fd56, %fd3;
 
-BB34_3:
+BB36_3:
 	mov.f64 	%fd4, %fd56;
 	cvta.to.global.u64 	%rd7, %rd3;
 	mul.f64 	%fd12, %fd4, %fd4;
@@ -4947,7 +5092,7 @@ BB34_3:
 	add.s64 	%rd9, %rd7, %rd8;
 	st.global.f64 	[%rd9], %fd55;
 
-BB34_4:
+BB36_4:
 	ret;
 }
 
@@ -4972,7 +5117,7 @@ BB34_4:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.u32	%p1, %r1, %r2;
-	@%p1 bra 	BB35_4;
+	@%p1 bra 	BB37_4;
 
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -4981,15 +5126,15 @@ BB34_4:
 	setp.eq.f64	%p2, %fd1, 0d0000000000000000;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p2 bra 	BB35_3;
-	bra.uni 	BB35_2;
+	@%p2 bra 	BB37_3;
+	bra.uni 	BB37_2;
 
-BB35_3:
+BB37_3:
 	mov.u64 	%rd8, 0;
 	st.global.u64 	[%rd1], %rd8;
-	bra.uni 	BB35_4;
+	bra.uni 	BB37_4;
 
-BB35_2:
+BB37_2:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r6}, %fd1;
@@ -5009,7 +5154,7 @@ BB35_2:
 	mov.b64 	%fd3, {%r11, %r10};
 	st.global.f64 	[%rd1], %fd3;
 
-BB35_4:
+BB37_4:
 	ret;
 }
 
@@ -5018,7 +5163,7 @@ BB35_4:
 	.param .b64 __internal_trig_reduction_slowpathd_param_1
 )
 {
-	.local .align 8 .b8 	__local_depot36[40];
+	.local .align 8 .b8 	__local_depot38[40];
 	.reg .b64 	%SP;
 	.reg .b64 	%SPL;
 	.reg .pred 	%p<9>;
@@ -5027,7 +5172,7 @@ BB35_4:
 	.reg .b64 	%rd<101>;
 
 
-	mov.u64 	%rd100, __local_depot36;
+	mov.u64 	%rd100, __local_depot38;
 	cvta.local.u64 	%SP, %rd100;
 	ld.param.f64 	%fd4, [__internal_trig_reduction_slowpathd_param_0];
 	ld.param.u64 	%rd37, [__internal_trig_reduction_slowpathd_param_1];
@@ -5041,7 +5186,7 @@ BB35_4:
 	shr.u32 	%r3, %r1, 20;
 	bfe.u32 	%r4, %r1, 20, 11;
 	setp.eq.s32	%p1, %r4, 2047;
-	@%p1 bra 	BB36_13;
+	@%p1 bra 	BB38_13;
 
 	add.s32 	%r16, %r4, -1024;
 	shr.u32 	%r17, %r16, 6;
@@ -5054,7 +5199,7 @@ BB35_4:
 	setp.gt.s32	%p2, %r5, %r6;
 	mov.u64 	%rd94, 0;
 	mov.u64 	%rd93, %rd1;
-	@%p2 bra 	BB36_4;
+	@%p2 bra 	BB38_4;
 
 	mov.b64 	 %rd41, %fd4;
 	shl.b64 	%rd42, %rd41, 11;
@@ -5073,7 +5218,7 @@ BB35_4:
 	mov.u64 	%rd91, %rd1;
 	mov.u32 	%r39, %r7;
 
-BB36_3:
+BB38_3:
 	.pragma "nounroll";
 	mov.u32 	%r8, %r39;
 	mov.u64 	%rd7, %rd91;
@@ -5108,15 +5253,15 @@ BB36_3:
 	setp.lt.s32	%p3, %r9, %r6;
 	mov.u64 	%rd91, %rd13;
 	mov.u32 	%r39, %r9;
-	@%p3 bra 	BB36_3;
+	@%p3 bra 	BB38_3;
 
-BB36_4:
+BB38_4:
 	st.local.u64 	[%rd93], %rd94;
 	ld.local.u64 	%rd95, [%rd1+16];
 	ld.local.u64 	%rd96, [%rd1+24];
 	and.b32  	%r10, %r3, 63;
 	setp.eq.s32	%p4, %r10, 0;
-	@%p4 bra 	BB36_6;
+	@%p4 bra 	BB38_6;
 
 	mov.u32 	%r27, 64;
 	sub.s32 	%r28, %r27, %r10;
@@ -5128,7 +5273,7 @@ BB36_4:
 	shr.u64 	%rd56, %rd55, %r28;
 	or.b64  	%rd95, %rd56, %rd54;
 
-BB36_6:
+BB38_6:
 	cvta.to.local.u64 	%rd57, %rd37;
 	shr.u64 	%rd58, %rd96, 62;
 	cvt.u32.u64	%r29, %rd58;
@@ -5145,7 +5290,7 @@ BB36_6:
 	selp.b32	%r34, %r32, %r33, %p5;
 	st.local.u32 	[%rd57], %r34;
 	setp.eq.s32	%p6, %r31, 0;
-	@%p6 bra 	BB36_8;
+	@%p6 bra 	BB38_8;
 
 	mov.u64 	%rd65, 0;
 	// inline asm
@@ -5165,10 +5310,10 @@ BB36_6:
 	// inline asm
 	xor.b32  	%r40, %r40, -2147483648;
 
-BB36_8:
+BB38_8:
 	clz.b64 	%r41, %rd98;
 	setp.eq.s32	%p7, %r41, 0;
-	@%p7 bra 	BB36_10;
+	@%p7 bra 	BB38_10;
 
 	shl.b64 	%rd68, %rd98, %r41;
 	mov.u32 	%r35, 64;
@@ -5176,7 +5321,7 @@ BB36_8:
 	shr.u64 	%rd69, %rd97, %r36;
 	or.b64  	%rd98, %rd69, %rd68;
 
-BB36_10:
+BB38_10:
 	mov.u64 	%rd73, -3958705157555305931;
 	// inline asm
 	{
@@ -5197,7 +5342,7 @@ BB36_10:
 	}
 	// inline asm
 	setp.lt.s64	%p8, %rd99, 1;
-	@%p8 bra 	BB36_12;
+	@%p8 bra 	BB38_12;
 
 	// inline asm
 	{
@@ -5216,7 +5361,7 @@ BB36_10:
 	// inline asm
 	add.s32 	%r41, %r41, 1;
 
-BB36_12:
+BB38_12:
 	cvt.u64.u32	%rd80, %r40;
 	shl.b64 	%rd81, %rd80, 32;
 	mov.u32 	%r37, 1022;
@@ -5231,7 +5376,7 @@ BB36_12:
 	or.b64  	%rd89, %rd88, %rd81;
 	mov.b64 	 %fd4, %rd89;
 
-BB36_13:
+BB38_13:
 	st.param.f64	[func_retval0+0], %fd4;
 	ret;
 }
@@ -5259,7 +5404,7 @@ BB36_13:
 	}
 	shr.u32 	%r50, %r49, 20;
 	setp.ne.s32	%p1, %r50, 0;
-	@%p1 bra 	BB37_2;
+	@%p1 bra 	BB39_2;
 
 	mul.f64 	%fd14, %fd12, 0d4350000000000000;
 	{
@@ -5273,13 +5418,13 @@ BB36_13:
 	shr.u32 	%r16, %r49, 20;
 	add.s32 	%r50, %r16, -54;
 
-BB37_2:
+BB39_2:
 	add.s32 	%r51, %r50, -1023;
 	and.b32  	%r17, %r49, -2146435073;
 	or.b32  	%r18, %r17, 1072693248;
 	mov.b64 	%fd132, {%r48, %r18};
 	setp.lt.u32	%p2, %r18, 1073127583;
-	@%p2 bra 	BB37_4;
+	@%p2 bra 	BB39_4;
 
 	{
 	.reg .b32 %temp; 
@@ -5293,7 +5438,7 @@ BB37_2:
 	mov.b64 	%fd132, {%r19, %r21};
 	add.s32 	%r51, %r50, -1022;
 
-BB37_4:
+BB39_4:
 	add.f64 	%fd16, %fd132, 0d3FF0000000000000;
 	// inline asm
 	rcp.approx.ftz.f64 %fd15,%fd16;
@@ -5458,13 +5603,13 @@ BB37_4:
 	mov.b32 	 %f2, %r35;
 	abs.f32 	%f1, %f2;
 	setp.lt.f32	%p4, %f1, 0f4086232B;
-	@%p4 bra 	BB37_7;
+	@%p4 bra 	BB39_7;
 
 	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
 	add.f64 	%fd129, %fd4, 0d7FF0000000000000;
 	selp.f64	%fd133, 0d0000000000000000, %fd129, %p5;
 	setp.geu.f32	%p6, %f1, 0f40874800;
-	@%p6 bra 	BB37_7;
+	@%p6 bra 	BB39_7;
 
 	shr.u32 	%r36, %r13, 31;
 	add.s32 	%r37, %r13, %r36;
@@ -5479,26 +5624,26 @@ BB37_4:
 	mov.b64 	%fd131, {%r44, %r43};
 	mul.f64 	%fd133, %fd130, %fd131;
 
-BB37_7:
+BB39_7:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r45}, %fd133;
 	}
 	and.b32  	%r46, %r45, 2147483647;
 	setp.ne.s32	%p7, %r46, 2146435072;
-	@%p7 bra 	BB37_9;
+	@%p7 bra 	BB39_9;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r47, %temp}, %fd133;
 	}
 	setp.eq.s32	%p8, %r47, 0;
-	@%p8 bra 	BB37_10;
+	@%p8 bra 	BB39_10;
 
-BB37_9:
+BB39_9:
 	fma.rn.f64 	%fd133, %fd133, %fd5, %fd133;
 
-BB37_10:
+BB39_10:
 	st.param.f64	[func_retval0+0], %fd133;
 	ret;
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index 6175621..9155203 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -23,24 +23,26 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
+import org.apache.sysml.lops.Append;
+import org.apache.sysml.lops.AppendG;
 import org.apache.sysml.lops.AppendGAlignedSP;
 import org.apache.sysml.lops.AppendM;
-import org.apache.sysml.lops.AppendCP;
-import org.apache.sysml.lops.AppendG;
 import org.apache.sysml.lops.AppendR;
 import org.apache.sysml.lops.Binary;
-import org.apache.sysml.lops.BinaryScalar;
 import org.apache.sysml.lops.BinaryM;
+import org.apache.sysml.lops.BinaryScalar;
 import org.apache.sysml.lops.BinaryUAggChain;
 import org.apache.sysml.lops.CentralMoment;
 import org.apache.sysml.lops.CoVariance;
 import org.apache.sysml.lops.CombineBinary;
+import org.apache.sysml.lops.CombineBinary.OperationTypes;
 import org.apache.sysml.lops.CombineUnary;
 import org.apache.sysml.lops.ConvolutionTransform;
 import org.apache.sysml.lops.Data;
 import org.apache.sysml.lops.DataPartition;
 import org.apache.sysml.lops.Group;
 import org.apache.sysml.lops.Lop;
+import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.lops.PartialAggregate;
 import org.apache.sysml.lops.PickByCount;
@@ -48,8 +50,6 @@ import org.apache.sysml.lops.RepMat;
 import org.apache.sysml.lops.SortKeys;
 import org.apache.sysml.lops.Unary;
 import org.apache.sysml.lops.UnaryCP;
-import org.apache.sysml.lops.CombineBinary.OperationTypes;
-import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
@@ -527,15 +527,20 @@ public class BinaryOp extends Hop
 			}
 			else //CP
 			{
+				if (DMLScript.USE_ACCELERATOR && dt1 == DataType.MATRIX && (DMLScript.FORCE_ACCELERATOR
+						|| getMemEstimate() < GPUContextPool.initialGPUMemBudget())) {
+					et = ExecType.GPU;
+				}
+
 				Lop offset = createOffsetLop( getInput().get(0), cbind ); //offset 1st input
-				append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind);
+				append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind, et);
 				append.getOutputParameters().setDimensions(rlen, clen, getRowsInBlock(), getColsInBlock(), getNnz());
 			}
 		}
 		else //SCALAR-STRING and SCALAR-STRING (always CP)
 		{
-			append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(), 
-				     Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind);
+			append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(),
+				     Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind, ExecType.CP);
 			append.getOutputParameters().setDimensions(0,0,-1,-1,-1);
 		}
 		

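Functionally, the BinaryOp change above replaces AppendCP with the unified Append lop and adds one routing decision: a matrix append now goes to the GPU when the accelerator is enabled and the operation is either forced to the GPU or its memory estimate fits the initial GPU budget. A standalone sketch of that decision follows; every identifier in it (useAccelerator, forceAccelerator, memEstimate, gpuMemBudget) is an illustrative stand-in for DMLScript.USE_ACCELERATOR, DMLScript.FORCE_ACCELERATOR, getMemEstimate() and GPUContextPool.initialGPUMemBudget(), not SystemML API.

// Illustrative, self-contained sketch of the exec-type decision added in BinaryOp above.
final class AppendDispatchSketch {
	enum ExecType { CP, GPU }

	static ExecType chooseExecType(boolean useAccelerator, boolean forceAccelerator,
			boolean inputIsMatrix, double memEstimate, double gpuMemBudget) {
		// Default to CP; go to GPU only when the accelerator is enabled, the input is a
		// matrix, and the op is either forced to the GPU or fits the GPU memory budget.
		ExecType et = ExecType.CP;
		if (useAccelerator && inputIsMatrix && (forceAccelerator || memEstimate < gpuMemBudget))
			et = ExecType.GPU;
		return et;
	}

	public static void main(String[] args) {
		System.out.println(chooseExecType(true, false, true, 512e6, 2e9)); // GPU: estimate fits budget
		System.out.println(chooseExecType(true, false, true, 4e9, 2e9));   // CP: estimate exceeds budget
	}
}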
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/Append.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Append.java b/src/main/java/org/apache/sysml/lops/Append.java
new file mode 100644
index 0000000..e224e51
--- /dev/null
+++ b/src/main/java/org/apache/sysml/lops/Append.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.lops;
+
+import org.apache.sysml.lops.LopProperties.ExecLocation;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.compile.JobType;
+import org.apache.sysml.parser.Expression.*;
+
+
+public class Append extends Lop
+{
+	public static final String OPCODE = "append";
+
+	private boolean _cbind = true;
+	private ExecType _et;
+
+	public Append(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind, ExecType et)
+	{
+		super(Lop.Type.Append, dt, vt);
+		_et = et;
+		init(input1, input2, input3, dt, vt);
+
+		_cbind = cbind;
+	}
+	
+	public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt) 
+	{
+		addInput(input1);
+		input1.addOutput(this);
+
+		addInput(input2);
+		input2.addOutput(this);
+		
+		addInput(input3);
+		input3.addOutput(this);
+		
+		boolean breaksAlignment = false;
+		boolean aligner = false;
+		boolean definesMRJob = false;
+		
+		lps.addCompatibility(JobType.INVALID);
+		lps.setProperties( inputs, _et, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
+	}
+	
+	@Override
+	public String toString() {
+
+		return " Append: ";
+	}
+
+	//called when append executes in CP
+	//called when append executes in CP or GPU
+	public String getInstructions(String input1, String input2, String input3, String output) 
+		throws LopsException
+	{
+		StringBuilder sb = new StringBuilder();
+		sb.append( getExecType() );
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( "append" );
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( getInputs().get(0).prepInputOperand(input1));
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( getInputs().get(1).prepInputOperand(input2));
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( getInputs().get(2).prepScalarInputOperand(getExecType()));
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( prepOutputOperand(output) );
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( _cbind );
+		
+		return sb.toString();
+	}
+}

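For reference, getInstructions() above serializes the append as a delimiter-separated string: exec type, the opcode "append", the two input operands, the offset operand, the output operand, and the cbind flag. The sketch below only illustrates that field order; the "°" delimiter and the plain operand names are assumptions for readability (the real code uses Lop.OPERAND_DELIMITOR and the prepInputOperand/prepOutputOperand encodings).

// Minimal sketch of the field order emitted by Append.getInstructions(); not SystemML code.
final class AppendInstructionStringSketch {
	// Assumed delimiter for illustration; the real code uses Lop.OPERAND_DELIMITOR.
	private static final String DELIM = "°";

	static String build(String execType, String in1, String in2, String offset, String out, boolean cbind) {
		// execType, opcode, input1, input2, offset, output, cbind flag
		return String.join(DELIM, execType, "append", in1, in2, offset, out, String.valueOf(cbind));
	}

	public static void main(String[] args) {
		// e.g. "GPU°append°A°B°-1°C°true" for a cbind that was routed to the GPU
		System.out.println(build("GPU", "A", "B", "-1", "C", true));
	}
}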
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/AppendCP.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/AppendCP.java b/src/main/java/org/apache/sysml/lops/AppendCP.java
deleted file mode 100644
index e76f21e..0000000
--- a/src/main/java/org/apache/sysml/lops/AppendCP.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.lops;
-
-import org.apache.sysml.lops.LopProperties.ExecLocation;
-import org.apache.sysml.lops.LopProperties.ExecType;
-import org.apache.sysml.lops.compile.JobType;
-import org.apache.sysml.parser.Expression.*;
-
-
-public class AppendCP extends Lop
-{
-	public static final String OPCODE = "append";
-
-	private boolean _cbind = true;
-	
-	public AppendCP(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind) 
-	{
-		super(Lop.Type.Append, dt, vt);
-		init(input1, input2, input3, dt, vt);
-		
-		_cbind = cbind;
-	}
-	
-	public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt) 
-	{
-		addInput(input1);
-		input1.addOutput(this);
-
-		addInput(input2);
-		input2.addOutput(this);
-		
-		addInput(input3);
-		input3.addOutput(this);
-		
-		boolean breaksAlignment = false;
-		boolean aligner = false;
-		boolean definesMRJob = false;
-		
-		lps.addCompatibility(JobType.INVALID);
-		lps.setProperties( inputs, ExecType.CP, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
-	}
-	
-	@Override
-	public String toString() {
-
-		return " AppendCP: ";
-	}
-
-	//called when append executes in CP
-	public String getInstructions(String input1, String input2, String input3, String output) 
-		throws LopsException
-	{
-		StringBuilder sb = new StringBuilder();
-		sb.append( getExecType() );
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( "append" );
-		
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(0).prepInputOperand(input1));
-		
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(1).prepInputOperand(input2));
-		
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(2).prepScalarInputOperand(getExecType()));
-		
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( prepOutputOperand(output) );
-		
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( _cbind );
-		
-		return sb.toString();
-	}
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 7088c50..e755fa0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -22,7 +22,7 @@ package org.apache.sysml.runtime.instructions;
 
 import java.util.HashMap;
 
-import org.apache.sysml.lops.AppendCP;
+import org.apache.sysml.lops.Append;
 import org.apache.sysml.lops.DataGen;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.lops.UnaryCP;
@@ -239,7 +239,7 @@ public class CPInstructionParser extends InstructionParser
 		// User-defined function Opcodes
 		String2CPInstructionType.put( "extfunct"   	, CPINSTRUCTION_TYPE.External);
 
-		String2CPInstructionType.put( AppendCP.OPCODE, CPINSTRUCTION_TYPE.Append);
+		String2CPInstructionType.put( Append.OPCODE, CPINSTRUCTION_TYPE.Append);
 		
 		// data generation opcodes
 		String2CPInstructionType.put( DataGen.RAND_OPCODE   , CPINSTRUCTION_TYPE.Rand);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 17b1578..36f57b4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -33,6 +33,7 @@ import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.RelationalBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.MatrixAppendGPUInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {
@@ -52,12 +53,15 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Convolution);
 
 		// Matrix Multiply Operators
-		String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
-		String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
+		String2GPUInstructionType.put( "ba+*",  GPUINSTRUCTION_TYPE.AggregateBinary);
+		String2GPUInstructionType.put( "tsmm",  GPUINSTRUCTION_TYPE.MMTSJ);
 
 		// Reorg/Transpose
-		String2GPUInstructionType.put( "r'",   GPUINSTRUCTION_TYPE.Reorg);
-	
+		String2GPUInstructionType.put( "r'",    GPUINSTRUCTION_TYPE.Reorg);
+
+		// Matrix Manipulation
+		String2GPUInstructionType.put( "append", GPUINSTRUCTION_TYPE.Append);
+
 		// Binary Cellwise
 		String2GPUInstructionType.put( "+",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
 		String2GPUInstructionType.put( "-",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
@@ -161,7 +165,10 @@ public class GPUInstructionParser  extends InstructionParser
 
 			case BuiltinBinary:
 				return BuiltinBinaryGPUInstruction.parseInstruction(str);
-			
+
+			case Append:
+				return MatrixAppendGPUInstruction.parseInstruction(str);
+
 			case Convolution:
 				return ConvolutionGPUInstruction.parseInstruction(str);
 				

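The parser change is the usual two-step dispatch: the opcode is first mapped to a GPUINSTRUCTION_TYPE, and the switch shown above then hands the raw instruction string to the matching parseInstruction factory. Below is a schematic sketch of that pattern with illustrative names only; just the "append" and "r'" opcodes come from the patch.

import java.util.HashMap;
import java.util.Map;

// Schematic of the opcode -> type -> factory dispatch; names are stand-ins, not SystemML API.
final class GpuDispatchSketch {
	enum Type { Append, Reorg }
	static final Map<String, Type> OPCODE_TO_TYPE = new HashMap<>();
	static {
		OPCODE_TO_TYPE.put("append", Type.Append);
		OPCODE_TO_TYPE.put("r'", Type.Reorg);
	}

	static String route(String opcode) {
		Type t = OPCODE_TO_TYPE.get(opcode);
		if (t == null) return "unknown opcode: " + opcode;
		switch (t) {
			case Append: return "MatrixAppendGPUInstruction.parseInstruction";
			case Reorg:  return "ReorgGPUInstruction.parseInstruction";
			default:     return "other factory";
		}
	}

	public static void main(String[] args) {
		System.out.println(route("append")); // -> MatrixAppendGPUInstruction.parseInstruction
	}
}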
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
index 7529b05..e1c163d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java
@@ -43,7 +43,7 @@ public abstract class BuiltinUnaryGPUInstruction  extends GPUInstruction {
 		_gputype = GPUINSTRUCTION_TYPE.BuiltinUnary;
 		this._arity = _arity;
 		_input = in;
-    _output = out;
+		_output = out;
 	}
 
 	public int getArity() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 7f981eb..a5388cb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -39,6 +39,7 @@ public abstract class GPUInstruction extends Instruction
 		Convolution,
 		MMTSJ,
 		Reorg,
+		Append,
 		ArithmeticBinary,
 		BuiltinUnary,
 		BuiltinBinary,
@@ -101,6 +102,8 @@ public abstract class GPUInstruction extends Instruction
 	public final static String MISC_TIMER_ACOS_KERNEL =                      "acosk";   // time spent in the acos kernel
 	public final static String MISC_TIMER_ATAN_KERNEL =                      "atank";   // time spent in the atan kernel
 	public final static String MISC_TIMER_SIGN_KERNEL =                      "signk";   // time spent in the sign kernel
+	public final static String MISC_TIMER_CBIND_KERNEL =                     "cbindk";  // time spent in the cbind kernel
+	public final static String MISC_TIMER_RBIND_KERNEL =                     "rbindk";  // time spent in the rbind kernel
 
 	public final static String MISC_TIMER_DAXPY_MV_KERNEL =                  "daxpymv";// time spent in the daxpy_matrix_vector kernel
 	public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =   "u2lk";   // time spent in the copy_u2l_dense kernel

http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
new file mode 100644
index 0000000..7671d7d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.OffsetColumnIndex;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.AppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.FrameAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.MatrixAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.ScalarAppendCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysml.utils.GPUStatistics;
+
+/**
+ * Implements the cbind and rbind functions for matrices
+ */
+public class MatrixAppendGPUInstruction extends GPUInstruction {
+
+	CPOperand output;
+	CPOperand input1, input2;
+	AppendCPInstruction.AppendType type;
+
+	public MatrixAppendGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out,  AppendCPInstruction.AppendType type, String opcode, String istr) {
+		super(op, opcode, istr);
+		this.output = out;
+		this.input1 = in1;
+		this.input2 = in2;
+		this.type = type;
+	}
+
+	public static MatrixAppendGPUInstruction parseInstruction ( String str )
+			throws DMLRuntimeException
+	{
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		InstructionUtils.checkNumFields (parts, 5);
+
+		String opcode = parts[0];
+		CPOperand in1 = new CPOperand(parts[1]);
+		CPOperand in2 = new CPOperand(parts[2]);
+		CPOperand in3 = new CPOperand(parts[3]);
+		CPOperand out = new CPOperand(parts[4]);
+		boolean cbind = Boolean.parseBoolean(parts[5]);
+
+		AppendCPInstruction.AppendType type = (in1.getDataType()!= Expression.DataType.MATRIX && in1.getDataType()!= Expression.DataType.FRAME) ?
+				AppendCPInstruction.AppendType.STRING : cbind ? AppendCPInstruction.AppendType.CBIND : AppendCPInstruction.AppendType.RBIND;
+
+		if (in1.getDataType()!= Expression.DataType.MATRIX || in2.getDataType()!= Expression.DataType.MATRIX){
+			throw new DMLRuntimeException("GPU : Error in internal state - Append was called on data other than matrices");
+		}
+
+		if(!opcode.equalsIgnoreCase("append"))
+			throw new DMLRuntimeException("Unknown opcode while parsing a MatrixAppendGPUInstruction: " + str);
+
+		Operator op = new ReorgOperator(OffsetColumnIndex.getOffsetColumnIndexFnObject(-1));
+		return new MatrixAppendGPUInstruction(op, in1, in2, out, type, opcode, str);
+	}
+
+	@Override
+	public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+
+		String opcode = getOpcode();
+		MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, input1.getName());
+		MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, input2.getName());
+
+		if(type == AppendCPInstruction.AppendType.CBIND) {
+			LibMatrixCUDA.cbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
+		} else if (type == AppendCPInstruction.AppendType.RBIND ) {
+			LibMatrixCUDA.rbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
+		} else {
+			throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
+		}
+		ec.releaseMatrixInputForGPUInstruction(input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(input2.getName());
+		ec.releaseMatrixOutputForGPUInstruction(output.getName());
+	}
+}
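
To make the semantics concrete, here is a plain CPU-side sketch of what the two new code paths compute, using dense row-major double arrays. It is an illustration only; the actual LibMatrixCUDA.cbind/rbind implementations operate on device memory, and their storage layout and kernels are not shown in this hunk.

// Reference semantics of cbind/rbind on dense row-major arrays; illustration only.
final class AppendSemanticsSketch {
	// C (rowsA x (colsA+colsB)) = cbind(A, B); A and B must have the same number of rows.
	static double[] cbind(double[] A, double[] B, int rowsA, int colsA, int colsB) {
		int colsC = colsA + colsB;
		double[] C = new double[rowsA * colsC];
		for (int i = 0; i < rowsA; i++) {
			System.arraycopy(A, i * colsA, C, i * colsC, colsA);          // row i of A
			System.arraycopy(B, i * colsB, C, i * colsC + colsA, colsB);  // row i of B, appended right
		}
		return C;
	}

	// C ((rowsA+rowsB) x colsA) = rbind(A, B); A and B must have the same number of columns.
	static double[] rbind(double[] A, double[] B, int rowsA, int rowsB, int colsA) {
		double[] C = new double[(rowsA + rowsB) * colsA];
		System.arraycopy(A, 0, C, 0, rowsA * colsA);             // all rows of A
		System.arraycopy(B, 0, C, rowsA * colsA, rowsB * colsA); // all rows of B, appended below
		return C;
	}
}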