You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/02/08 19:15:31 UTC

[1/3] incubator-systemml git commit: [SYSTEMML-1039] Added variance, row/col variance

Repository: incubator-systemml
Updated Branches:
  refs/heads/master f8d707788 -> ad009d81f


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 210d4fb..23e1da4 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -151,6 +151,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 										|| (_op == AggOp.MAX 			&& (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
 										|| (_op == AggOp.MIN 			&& (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
 										|| (_op == AggOp.MEAN 		&& (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+										|| (_op == AggOp.VAR 		&& (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
 										|| (_op == AggOp.PROD 		&& (_direction == Direction.RowCol))){
 							et = ExecType.GPU;
 							k = 1;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index b54d020..3dbdb1e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -92,7 +92,9 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "uasqk+"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Sum of Squares
 		String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Sum of Squares
 		String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Sum of Squares
-
+		String2GPUInstructionType.put( "uavar" 	 , GPUINSTRUCTION_TYPE.AggregateUnary);		// Variance
+		String2GPUInstructionType.put( "uarvar"  , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Variance
+		String2GPUInstructionType.put( "uacvar"  , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Variance
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str ) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 2443752..b7c0e99 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
 		ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
 		
 		BinaryOperator bop = (BinaryOperator) _optr;
-		LibMatrixCUDA.bincellOp(ec, in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+		LibMatrixCUDA.matrixScalarArithmetic(ec, in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
 		
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index b1029ee..78b480e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
 		ScalarOperator sc_op = (ScalarOperator) _optr;
 		sc_op.setConstant(constant.getDoubleValue());
 		
-		LibMatrixCUDA.bincellOp(ec, in1, _output.getName(), isTransposed, sc_op);
+		LibMatrixCUDA.matrixScalarArithmetic(ec, in1, _output.getName(), isTransposed, sc_op);
 		
 		ec.releaseMatrixInputForGPUInstruction(mat.getName());
         ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 542ed97..9d3220b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -84,6 +84,8 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
 
+import java.util.Vector;
+
 //FIXME move code to respective instructions, this is not a block library
 public class LibMatrixCUDA {
 
@@ -1121,7 +1123,6 @@ public class LibMatrixCUDA {
 						reduceCol("reduce_col_sum", in, out, rlen, clen);
 						break;
 					}
-
 					case REDUCTION_DIAG :
 						throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
 				}
@@ -1130,8 +1131,7 @@ public class LibMatrixCUDA {
 			case OP_PLUS_SQ : {
 				// Calculate the squares in a temporary object tmp
 				Pointer tmp = JCudaObject.allocate(size * Sizeof.DOUBLE);
-				ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
-				binCellOpHelper(in, 2, rlen, clen, tmp, power2op);
+				squareMatrix(in, tmp, rlen, clen);
 				// Then do the sum on the temporary object and free it
 				switch(reductionDirection) {
 					case REDUCTION_ALL : {
@@ -1227,15 +1227,68 @@ public class LibMatrixCUDA {
 				break;
 			}
 			case OP_VARIANCE : {
+				// Temporary GPU arrays to hold intermediate results
+				Pointer tmp = JCudaObject.allocate(size * Sizeof.DOUBLE);
+				Pointer tmp2 = JCudaObject.allocate(size * Sizeof.DOUBLE);
+
 				switch(reductionDirection) {
-					case REDUCTION_ALL:
-					case REDUCTION_COL:
-					case REDUCTION_ROW:
-						throw new DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not implemented yet for GPU ");
+
+					case REDUCTION_ALL: {
+						double result = reduceAll("reduce_sum", in, size);
+						double mean = result / size;
+
+						// Subtract mean from every element in the matrix
+						ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
+						matrixScalarOp(in, mean, rlen, clen, tmp, minusOp);
+
+						squareMatrix(tmp, tmp2, rlen, clen);
+
+						double result2 = reduceAll("reduce_sum", tmp2, size);
+						double variance = result2 / (size - 1);
+						ec.setScalarOutput(output, new DoubleObject(variance));
+
+						break;
+					}
+					case REDUCTION_COL: {
+						reduceRow("reduce_row_mean", in, out, rlen, clen);
+						// Subtract the row-wise mean from every element in the matrix
+						BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+						matrixMatrixOp(in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
+
+						squareMatrix(tmp, tmp2, rlen, clen);
+
+						Pointer tmpRow = JCudaObject.allocate(rlen * Sizeof.DOUBLE);
+						reduceRow("reduce_row_sum", tmp2, tmpRow, rlen, clen);
+
+						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
+						matrixScalarOp(tmpRow, clen - 1, rlen, clen, out, divideOp);
+
+						cudaFree(tmpRow);
+						break;
+					}
+					case REDUCTION_ROW: {
+						reduceCol("reduce_col_mean", in, out, rlen, clen);
+						// Subtract the column-wise mean from every element in the matrix
+						BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+						matrixMatrixOp(in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
+
+						squareMatrix(tmp, tmp2, rlen, clen);
+
+						Pointer tmpCol = JCudaObject.allocate(clen * Sizeof.DOUBLE);
+						reduceCol("reduce_col_sum", tmp2, tmpCol, rlen, clen);
+
+						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
+						matrixScalarOp(tmpCol, rlen - 1, rlen, clen, out, divideOp);
+
+						cudaFree(tmpCol);
+						break;
+					}
 					default:
 						throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
 				}
-				// break;
+				cudaFree(tmp);
+				cudaFree(tmp2);
+				break;
 			}
 			case OP_MAXINDEX : {
 				switch(reductionDirection) {
@@ -1260,6 +1313,19 @@ public class LibMatrixCUDA {
 	}
 
 	/**
+	 * Helper method to square a matrix in GPU memory
+	 * @param in		input matrix on GPU
+	 * @param out		output matrix on GPU
+	 * @param rlen	row length
+	 * @param clen	column length
+	 * @throws DMLRuntimeException
+	 */
+	private static void squareMatrix(Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
+		ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
+		matrixScalarOp(in, 2, rlen, clen, out, power2op);
+	}
+
+	/**
 	 * Do a simple reduction, the output of which is a single value
 	 * @param kernelFunction 	name of the kernel function to invoke
 	 * @param in							{@link Pointer} to matrix in device memory
@@ -1275,9 +1341,9 @@ public class LibMatrixCUDA {
 		kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 						in, tempOut, n);
 		cudaDeviceSynchronize();
-		int s = n;
+		int s = blocks;
 		while (s > 1) {
-			tmp = getKernelParamsForReduceAll(n);
+			tmp = getKernelParamsForReduceAll(s);
 			blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
 			kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 							tempOut, tempOut, s);
@@ -1652,7 +1718,7 @@ public class LibMatrixCUDA {
 	}
 
 	/**
-	 * Performs elementwise matrix-scalar operation specified by op
+	 * Entry point to perform elementwise matrix-scalar operation specified by op
 	 *
 	 * @param ec execution context
 	 * @param in input matrix
@@ -1661,7 +1727,7 @@ public class LibMatrixCUDA {
 	 * @param op scalar operator
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void bincellOp(ExecutionContext ec, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
+	public static void matrixScalarArithmetic(ExecutionContext ec, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
 		double constant = op.getConstant();
 		boolean isCUDALibAvailable = (op.fn instanceof Multiply
 						|| (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(in);
@@ -1685,7 +1751,7 @@ public class LibMatrixCUDA {
 				}
 				else {
 					// TODO: Potential to optimize
-					launchBinCellOpKernel(ec, in, outputName, isInputTransposed, op);
+					matrixScalarOp(ec, in, outputName, isInputTransposed, op);
 				}
 			}
 			else if(constant == 1.0 && op.fn instanceof Or) {
@@ -1695,7 +1761,7 @@ public class LibMatrixCUDA {
 				deviceCopy(ec, in, outputName, isInputTransposed);
 			}
 			else {
-				launchBinCellOpKernel(ec, in, outputName, isInputTransposed, op);
+				matrixScalarOp(ec, in, outputName, isInputTransposed, op);
 			}
 		}
 		else {
@@ -1717,7 +1783,7 @@ public class LibMatrixCUDA {
 	}
 
 	/**
-	 * Utility to launch binCellScalarOp kernel
+	 * Utility to launch the matrix-scalar operation CUDA kernel
 	 *
 	 * @param ec execution context
 	 * @param in input matrix
@@ -1726,8 +1792,8 @@ public class LibMatrixCUDA {
 	 * @param op operator
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	private static void launchBinCellOpKernel(ExecutionContext ec, MatrixObject in, String outputName, boolean isInputTransposed,
-																						ScalarOperator op) throws DMLRuntimeException {
+	private static void matrixScalarOp(ExecutionContext ec, MatrixObject in, String outputName, boolean isInputTransposed,
+																		 ScalarOperator op) throws DMLRuntimeException {
 		if(isInputTransposed)
 			throw new DMLRuntimeException("Transposing the input is not supported");
 
@@ -1742,8 +1808,7 @@ public class LibMatrixCUDA {
 		MatrixObject out = ec.getMatrixObject(outputName);
 		ec.getDenseMatrixOutputForGPUInstruction(outputName);	// Allocated the dense output matrix
 		Pointer C = ((JCudaObject)out.getGPUObject()).jcudaDenseMatrixPtr;
-		// Invokes binCellScalarOp(double* A, double scalar, double* C, int rlenA, int clenA, int op, int isLeftScalar)
-		binCellOpHelper(A, scalar, rlenA, clenA, C, op);
+		matrixScalarOp(A, scalar, rlenA, clenA, C, op);
 	}
 
 	/**
@@ -1758,15 +1823,16 @@ public class LibMatrixCUDA {
 	 * @param op				operation to perform
 	 * @throws DMLRuntimeException throws runtime exception
 	 */
-	private static void binCellOpHelper(Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
+	private static void matrixScalarOp(Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
 		int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
-		kernels.launchKernel("binCellScalarOp",
-						ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
-						a, scalar, c, rlenA, clenA, getBinaryOp(op.fn), isLeftScalar);
+    int size = rlenA * clenA;
+		kernels.launchKernel("matrix_scalar_op",
+						ExecutionConfig.getConfigForSimpleVectorOperations(size),
+						a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
 	}
 
 	/**
-	 * Utility to launch binCellOp kernel
+	 * Utility to launch binary cellwise matrix-matrix operations CUDA kernel
 	 *
 	 * @param ec execution context
 	 * @param in1 left input matrix
@@ -1777,8 +1843,8 @@ public class LibMatrixCUDA {
 	 * @param op operator
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	private static void launchBinCellOpKernel(ExecutionContext ec, MatrixObject in1, MatrixObject in2,
-																						String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+	private static void matrixMatrixOp(ExecutionContext ec, MatrixObject in1, MatrixObject in2,
+																		 String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
 
 		boolean isSparse1 = isInSparseFormat(in1);
 		boolean isEmpty1 = isSparseAndEmpty(in1);
@@ -1788,8 +1854,8 @@ public class LibMatrixCUDA {
 		int rlenB = (int) in2.getNumRows();
 		int clenA = (int) in1.getNumColumns();
 		int clenB = (int) in2.getNumColumns();
-		int vecStatusA = getVectorStatus(in1);
-		int vecStatusB = getVectorStatus(in2);
+		int vecStatusA = getVectorStatus(rlenA, clenA).code();
+		int vecStatusB = getVectorStatus(rlenB, clenB).code();
 
 		if (isEmpty1 && isEmpty2){
 			MatrixObject out = ec.getMatrixObject(outputName);
@@ -1804,12 +1870,12 @@ public class LibMatrixCUDA {
 		// Check for M1 * M2 when M1 is empty; if M2 is a vector then fallback to general case
 		else if(isEmpty1 && clenB != 1 && rlenB != 1) {
 			// C = empty_in1 op in2 ==> becomes ==> C = 0.0 op in2
-			bincellOp(ec, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
+			matrixScalarArithmetic(ec, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
 		}
 		// Check for M1 * M2 when M2 is empty; if M1 is a vector then fallback to general case
 		else if(isEmpty2 && clenA != 1 && rlenA != 1) {
 			// C = in1 op empty_in2 ==> becomes ==> C = in1 op 0.0
-			bincellOp(ec, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
+			matrixScalarArithmetic(ec, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
 		}
 		else {
 			if(isSparse1) {
@@ -1830,21 +1896,60 @@ public class LibMatrixCUDA {
 			int maxRlen = Math.max(rlenA, rlenB);
 			int maxClen = Math.max(clenA, clenB);
 
-			kernels.launchKernel("binCellOp",
-							ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
-							A, B, C, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
+			matrixMatrixOp(A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
 		}
 	}
 
-	private static int getVectorStatus(MatrixObject in) {
-		long rows = in.getNumRows();
-		long cols = in.getNumColumns();
+	/**
+	 * Do an elementwise matrix-matrix arithmetic operation on the GPU
+	 * c = a op b
+	 * Either rows and cols in A are the same as in B or
+	 * one of them is a vector or both are.
+	 * @param a						The input matrix a allocated on the GPU
+	 * @param b						The input matrix b allocated on the GPU
+	 * @param maxRlen			the maximum of the row lengths between a & b
+	 * @param maxClen			the maximum of the column lengths between a & b
+	 * @param vecStatusA	code indicating whether matrix A is a row vector, column vector or neither
+	 * @param vecStatusB	code indicating whether matrix B is a row vector, column vector or neither
+	 * @param c						output matrix of size (maxRlen, maxClen) allocated on GPU
+	 * @param op					the operation to perform
+	 * @throws DMLRuntimeException
+	 */
+	private static void matrixMatrixOp(Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException {
+		kernels.launchKernel("matrix_matrix_cellwise_op",
+            ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
+						a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
+	}
+
+	/**
+	 * This enum declares the different vector shapes
+	 * as they recognized in the invoked CUDA kernel(s).
+	 */
+	enum VectorShape {
+		COLUMN 	(1),
+		ROW 		(2),
+		NONE 		(0);
+		private final int code;
+		VectorShape(int code) {
+			this.code = code;
+		}
+		int code() { return code; }
+	}
+
+	/**
+	 * Given the number of rows and columns, returns
+	 * whether this is a row vector, column vector or neither.
+	 * @param rows	number of rows in the matrix
+	 * @param cols	number of columns in the matrix
+	 * @return VectorShape.COLUMN for a column vector, VectorShape.ROW for a row vector, VectorShape.NONE otherwise
+	 */
+	private static VectorShape getVectorStatus(long rows, long cols) {
 		if(cols == 1)
-			return 1;
+			return VectorShape.COLUMN;
 		else if(rows == 1)
-			return 2;
+			return VectorShape.ROW;
 		else
-			return 0;
+			return VectorShape.NONE;
 	}
 
 	private static boolean isSparseAndEmpty(MatrixObject in1) {
@@ -1861,7 +1966,7 @@ public class LibMatrixCUDA {
 	}
 
 	/**
-	 * Performs a deep device copy of input matrix
+	 * Performs a deep device copy of a matrix on the GPU
 	 *
 	 * @param ec execution context
 	 * @param src source matrix
@@ -1899,8 +2004,7 @@ public class LibMatrixCUDA {
 						A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
 	}
 
-	/**
-	 */
+
 	private static void setOutputToConstant(ExecutionContext ec, double constant, String outputName) throws DMLRuntimeException {
 		if(constant == 0) {
 			// TODO: Create sparse empty block instead
@@ -1980,11 +2084,11 @@ public class LibMatrixCUDA {
 	 * @param op binary operator
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void bincellOp(ExecutionContext ec, MatrixObject in1, MatrixObject in2,
-															 String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+	public static void matrixScalarArithmetic(ExecutionContext ec, MatrixObject in1, MatrixObject in2,
+																						String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
 		boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(in1) && !isSparseAndEmpty(in2) && !isVector(in1) && !isVector(in2);
 		if(!isCUDALibAvailable) {
-			launchBinCellOpKernel(ec, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
+			matrixMatrixOp(ec, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
 		}
 		else {
 			double alpha;
@@ -2009,9 +2113,14 @@ public class LibMatrixCUDA {
 		return in.getNumRows() == 1 || in.getNumColumns() == 1;
 	}
 
-	// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
-	// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
-	// 11=min, 12=max, 13=and, 14=or, 15=log}
+	/**
+	 * Helper function to get numeric value for binary op.
+	 * This number is passed down to the CUDA kernel
+	 * and the appropriate binary operation is performed on the GPU.
+	 * op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
+	 * 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
+	 * 11=min, 12=max, 13=and, 14=or, 15=log}
+	 */
 	private static int getBinaryOp(ValueFunction fn) throws DMLRuntimeException {
 		if(fn instanceof Plus) return 0;
 		else if(fn instanceof Minus) return 1;
@@ -2116,4 +2225,24 @@ public class LibMatrixCUDA {
 		// = 1.0 * A^T + 0.0 * A^T
 		dgeam(ec, in, in, outputName, true, true, 1.0, 0.0);
 	}
+
+	/**
+	 * Convenience method for debugging matrices on the GPU.
+	 * @param in		Pointer to a double array (matrix) on the GPU
+	 * @param rlen	row length
+	 * @param clen	column length
+	 */
+	private static void debugPrintMatrix(Pointer in, int rlen, int clen){
+		double[] data = new double[rlen * clen];
+		cudaMemcpy(Pointer.to(data), in, rlen*clen*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+		int k=0;
+		for (int i=0; i<rlen; ++i){
+			for (int j=0; j<clen; ++j){
+				System.out.print(data[k]);
+				k++;
+			}
+			System.out.println();
+		}
+	}
+
 }


[3/3] incubator-systemml git commit: [SYSTEMML-1039] Added variance, row/col variance

Posted by ni...@apache.org.
[SYSTEMML-1039] Added variance, row/col variance

Closes #383.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ad009d81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ad009d81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ad009d81

Branch: refs/heads/master
Commit: ad009d81f759caed7ed134771fc6236d7cf21866
Parents: f8d7077
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed Feb 8 11:14:43 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Feb 8 11:14:43 2017 -0800

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |  105 +-
 src/main/cpp/kernels/SystemML.ptx               | 2772 ++++++++----------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |    1 +
 .../instructions/GPUInstructionParser.java      |    4 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |    2 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  225 +-
 7 files changed, 1479 insertions(+), 1632 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 4ce6fb2..cda28ba 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -40,7 +40,7 @@ __global__ void copyUpperToLowerTriangleDense(double* ret, int dim, int N) {
 }
 
 extern "C"
-__device__ double getBoolean(int val) {
+__forceinline__ __device__ double getBoolean(int val) {
 	if(val == 0)
 		return 0.0;
 	else
@@ -51,39 +51,23 @@ __device__ double getBoolean(int val) {
 // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=log}
 extern "C"
-__device__ double binaryOp(double x, double y, int op) {
-	// 0=plus, 1=minus, 2=multiply, 3=divide, 4=power
-	if(op == 0)
-		return x + y;
-	else if(op == 1)
-		return x - y;
-	else if(op == 2)
-		return x * y;
-	else if(op == 3)
-		return x / y;
-	else if(op == 4)
-		return pow(x, y);
-	// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
-	else if(op == 5)
-		return getBoolean(x < y);
-	else if(op == 6)
-		return getBoolean(x <= y);
-	else if(op == 7)
-		return getBoolean(x > y);
-	else if(op == 8)
-		return getBoolean(x >= y);
-	else if(op == 9)
-		return getBoolean(x == y);
-	else if(op == 10)
-		return getBoolean(x != y);
-	// 11=min, 12=max, 13=and, 14=or, 15=log
-	else if(op == 11) {
-		return min(x, y);
-	}
-	else if(op == 12) {
-		return max(x, y);
-	}
-	return -999;
+__forceinline__ __device__ double binaryOp(double x, double y, int op) {
+	switch(op) {
+        case 0 : return x + y;
+        case 1 : return x - y;
+        case 2 : return x * y;
+        case 3 : return x / y;
+        case 4 : return pow(x, y);
+        case 5 : return getBoolean(x < y);
+        case 6 : return getBoolean(x <= y);
+        case 7 : return getBoolean(x > y);
+        case 8 : return getBoolean(x >= y);
+        case 9 : return getBoolean(x == y);
+        case 10 : return getBoolean(x != y);
+        case 11 : return min(x, y);
+        case 12 : return max(x, y);
+        default : return DBL_MAX;
+    }
 }
 
 extern "C"
@@ -158,8 +142,22 @@ __global__ void compareAndSet(double* A,  double* ret, int rlen, int clen, doubl
 	}
 }
 
+
+/**
+ * Performs a binary cellwise arithmetic operation on 2 matrices.
+ * Either both matrices are of equal size or one of them is a vector or both are.
+ * @param A                 first input matrix allocated on GPU
+ * @param B                 second input matrix allocated on GPU
+ * @param C                 output allocated on GPU
+ * @param maxRlen           maximum of the row lengths of A and B
+ * @param maxClen           maximum of the column lengths of A and B
+ * @param vectorAStatus     if A is a row vector, column vector or neither
+ * @param vectorBStatus     if B is a row vector, column vector or neither
+ * @param op                the numeric code of the arithmetic operation to perform
+ *
+ */
 extern "C"
-__global__ void binCellOp(double* A, double* B, double* C,
+__global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
 	int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
 	int ix = blockIdx.x * blockDim.x + threadIdx.x;
 	int iy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -177,21 +175,32 @@ __global__ void binCellOp(double* A, double* B, double* C,
 		else if(vectorBStatus == 2)
 			bIndex = iy; // rlen == 1
 		C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
-		// printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+		//printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+    __syncthreads();
 	}
 }
 
+/**
+ * Performs an arithmetic operation between a matrix and a scalar.
+ * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is the operation)
+ * @param A             input matrix allocated on GPU
+ * @param scalar        scalar input
+ * @param C             output matrix allocated on GPU
+ * @param size          number of elements in matrix A
+ * @param op            number code of the arithmetic operation to perform
+ * @param isLeftScalar  whether the scalar is on the left side
+ */
 extern "C"
-__global__ void binCellScalarOp(double* A, double scalar, double* C, int rlenA, int clenA, int op, int isLeftScalar) {
-	int ix = blockIdx.x * blockDim.x + threadIdx.x;
-	int iy = blockIdx.y * blockDim.y + threadIdx.y;
-	int index = ix * clenA + iy;
-	if(index < rlenA*clenA) {
-		if(isLeftScalar)
+__global__ void matrix_scalar_op(double* A, double scalar, double* C, int size, int op, int isLeftScalar) {
+	int index = blockIdx.x *blockDim.x + threadIdx.x;
+	if(index < size) {
+		if(isLeftScalar) {
 			C[index] = binaryOp(scalar, A[index], op);
-		else
+		} else {
 			C[index] = binaryOp(A[index], scalar, op);
+    }
 	}
+  __syncthreads();
 }
 
 
@@ -475,7 +484,7 @@ typedef struct {
 extern "C"
 __global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
     MaxOp op;
-    reduce<MaxOp>(g_idata, g_odata, n, op, DBL_MIN);
+    reduce<MaxOp>(g_idata, g_odata, n, op, -DBL_MAX);
 }
 
 /**
@@ -489,7 +498,7 @@ extern "C"
 __global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
     MaxOp op;
     IdentityOp aop;
-    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MIN);
+    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
 }
 
 /**
@@ -503,7 +512,7 @@ extern "C"
 __global__ void reduce_col_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
     MaxOp op;
     IdentityOp aop;
-    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MIN);
+    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
 }
 
 /**
@@ -602,7 +611,7 @@ struct MeanOp {
 extern "C"
 __global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
     SumOp op;
-    MeanOp aop(rows*cols);
+    MeanOp aop(cols);
     reduce_row<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
 }
 
@@ -616,6 +625,6 @@ __global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int r
 extern "C"
 __global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
     SumOp op;
-    MeanOp aop(rows*cols);
+    MeanOp aop(rows);
     reduce_col<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
 }


[2/3] incubator-systemml git commit: [SYSTEMML-1039] Added variance, row/col variance

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index dfff5dd..93f3879 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-	// .globl	getBoolean
+	// .globl	_Z6reduceI5SumOpEvPdS1_jT_d
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
 	.param .b64 __internal_accurate_pow_param_0,
@@ -19,307 +19,6 @@
 ;
 .extern .shared .align 8 .b8 sdata[];
 
-.visible .func  (.param .b64 func_retval0) getBoolean(
-	.param .b32 getBoolean_param_0
-)
-{
-	.reg .pred 	%p<2>;
-	.reg .b32 	%r<2>;
-	.reg .f64 	%fd<2>;
-
-
-	ld.param.u32 	%r1, [getBoolean_param_0];
-	setp.eq.s32	%p1, %r1, 0;
-	selp.f64	%fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
-	st.param.f64	[func_retval0+0], %fd1;
-	ret;
-}
-
-	// .globl	binaryOp
-.visible .func  (.param .b64 func_retval0) binaryOp(
-	.param .b64 binaryOp_param_0,
-	.param .b64 binaryOp_param_1,
-	.param .b32 binaryOp_param_2
-)
-{
-	.reg .pred 	%p<41>;
-	.reg .b32 	%r<30>;
-	.reg .f64 	%fd<40>;
-	.reg .b64 	%rd<3>;
-
-
-	ld.param.f64 	%fd26, [binaryOp_param_0];
-	ld.param.f64 	%fd27, [binaryOp_param_1];
-	ld.param.u32 	%r3, [binaryOp_param_2];
-	setp.eq.s32	%p2, %r3, 0;
-	@%p2 bra 	BB1_40;
-
-	setp.eq.s32	%p3, %r3, 1;
-	@%p3 bra 	BB1_39;
-	bra.uni 	BB1_2;
-
-BB1_39:
-	sub.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_40:
-	add.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_2:
-	setp.eq.s32	%p4, %r3, 2;
-	@%p4 bra 	BB1_38;
-	bra.uni 	BB1_3;
-
-BB1_38:
-	mul.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_3:
-	setp.eq.s32	%p5, %r3, 3;
-	@%p5 bra 	BB1_37;
-	bra.uni 	BB1_4;
-
-BB1_37:
-	div.rn.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_4:
-	setp.eq.s32	%p6, %r3, 4;
-	@%p6 bra 	BB1_21;
-	bra.uni 	BB1_5;
-
-BB1_21:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r1}, %fd26;
-	}
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r2}, %fd27;
-	}
-	bfe.u32 	%r4, %r2, 20, 11;
-	add.s32 	%r5, %r4, -1012;
-	mov.b64 	 %rd2, %fd27;
-	shl.b64 	%rd1, %rd2, %r5;
-	setp.eq.s64	%p21, %rd1, -9223372036854775808;
-	abs.f64 	%fd9, %fd26;
-	// Callseq Start 0
-	{
-	.reg .b32 temp_param_reg;
-	// <end>}
-	.param .b64 param0;
-	st.param.f64	[param0+0], %fd9;
-	.param .b64 param1;
-	st.param.f64	[param1+0], %fd27;
-	.param .b64 retval0;
-	call.uni (retval0), 
-	__internal_accurate_pow, 
-	(
-	param0, 
-	param1
-	);
-	ld.param.f64	%fd38, [retval0+0];
-	
-	//{
-	}// Callseq End 0
-	setp.lt.s32	%p22, %r1, 0;
-	and.pred  	%p1, %p22, %p21;
-	@!%p1 bra 	BB1_23;
-	bra.uni 	BB1_22;
-
-BB1_22:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r6}, %fd38;
-	}
-	xor.b32  	%r7, %r6, -2147483648;
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r8, %temp}, %fd38;
-	}
-	mov.b64 	%fd38, {%r8, %r7};
-
-BB1_23:
-	mov.f64 	%fd37, %fd38;
-	setp.eq.f64	%p23, %fd26, 0d0000000000000000;
-	@%p23 bra 	BB1_26;
-	bra.uni 	BB1_24;
-
-BB1_26:
-	selp.b32	%r9, %r1, 0, %p21;
-	or.b32  	%r10, %r9, 2146435072;
-	setp.lt.s32	%p27, %r2, 0;
-	selp.b32	%r11, %r10, %r9, %p27;
-	mov.u32 	%r12, 0;
-	mov.b64 	%fd37, {%r12, %r11};
-	bra.uni 	BB1_27;
-
-BB1_5:
-	setp.eq.s32	%p7, %r3, 5;
-	@%p7 bra 	BB1_20;
-	bra.uni 	BB1_6;
-
-BB1_20:
-	setp.lt.f64	%p20, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p20;
-	bra.uni 	BB1_41;
-
-BB1_6:
-	setp.eq.s32	%p8, %r3, 6;
-	@%p8 bra 	BB1_19;
-	bra.uni 	BB1_7;
-
-BB1_19:
-	setp.le.f64	%p19, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p19;
-	bra.uni 	BB1_41;
-
-BB1_24:
-	setp.gt.s32	%p24, %r1, -1;
-	@%p24 bra 	BB1_27;
-
-	cvt.rzi.f64.f64	%fd29, %fd27;
-	setp.neu.f64	%p25, %fd29, %fd27;
-	selp.f64	%fd37, 0dFFF8000000000000, %fd37, %p25;
-
-BB1_27:
-	mov.f64 	%fd15, %fd37;
-	add.f64 	%fd16, %fd26, %fd27;
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r13}, %fd16;
-	}
-	and.b32  	%r14, %r13, 2146435072;
-	setp.ne.s32	%p28, %r14, 2146435072;
-	mov.f64 	%fd36, %fd15;
-	@%p28 bra 	BB1_36;
-
-	setp.gtu.f64	%p29, %fd9, 0d7FF0000000000000;
-	mov.f64 	%fd36, %fd16;
-	@%p29 bra 	BB1_36;
-
-	abs.f64 	%fd30, %fd27;
-	setp.gtu.f64	%p30, %fd30, 0d7FF0000000000000;
-	mov.f64 	%fd35, %fd16;
-	mov.f64 	%fd36, %fd35;
-	@%p30 bra 	BB1_36;
-
-	and.b32  	%r15, %r2, 2147483647;
-	setp.ne.s32	%p31, %r15, 2146435072;
-	@%p31 bra 	BB1_32;
-
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r16, %temp}, %fd27;
-	}
-	setp.eq.s32	%p32, %r16, 0;
-	@%p32 bra 	BB1_35;
-
-BB1_32:
-	and.b32  	%r17, %r1, 2147483647;
-	setp.ne.s32	%p33, %r17, 2146435072;
-	mov.f64 	%fd33, %fd15;
-	mov.f64 	%fd36, %fd33;
-	@%p33 bra 	BB1_36;
-
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%r18, %temp}, %fd26;
-	}
-	setp.ne.s32	%p34, %r18, 0;
-	mov.f64 	%fd36, %fd15;
-	@%p34 bra 	BB1_36;
-
-	shr.s32 	%r19, %r2, 31;
-	and.b32  	%r20, %r19, -2146435072;
-	add.s32 	%r21, %r20, 2146435072;
-	or.b32  	%r22, %r21, -2147483648;
-	selp.b32	%r23, %r22, %r21, %p1;
-	mov.u32 	%r24, 0;
-	mov.b64 	%fd36, {%r24, %r23};
-	bra.uni 	BB1_36;
-
-BB1_7:
-	setp.eq.s32	%p9, %r3, 7;
-	@%p9 bra 	BB1_18;
-	bra.uni 	BB1_8;
-
-BB1_18:
-	setp.gt.f64	%p18, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p18;
-	bra.uni 	BB1_41;
-
-BB1_8:
-	setp.eq.s32	%p10, %r3, 8;
-	@%p10 bra 	BB1_17;
-	bra.uni 	BB1_9;
-
-BB1_17:
-	setp.ge.f64	%p17, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p17;
-	bra.uni 	BB1_41;
-
-BB1_9:
-	setp.eq.s32	%p11, %r3, 9;
-	@%p11 bra 	BB1_16;
-	bra.uni 	BB1_10;
-
-BB1_16:
-	setp.eq.f64	%p16, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p16;
-	bra.uni 	BB1_41;
-
-BB1_10:
-	setp.eq.s32	%p12, %r3, 10;
-	@%p12 bra 	BB1_15;
-	bra.uni 	BB1_11;
-
-BB1_15:
-	setp.neu.f64	%p15, %fd26, %fd27;
-	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p15;
-	bra.uni 	BB1_41;
-
-BB1_35:
-	setp.gt.f64	%p35, %fd9, 0d3FF0000000000000;
-	selp.b32	%r25, 2146435072, 0, %p35;
-	xor.b32  	%r26, %r25, 2146435072;
-	setp.lt.s32	%p36, %r2, 0;
-	selp.b32	%r27, %r26, %r25, %p36;
-	setp.eq.f64	%p37, %fd26, 0dBFF0000000000000;
-	selp.b32	%r28, 1072693248, %r27, %p37;
-	mov.u32 	%r29, 0;
-	mov.b64 	%fd36, {%r29, %r28};
-
-BB1_36:
-	setp.eq.f64	%p38, %fd27, 0d0000000000000000;
-	setp.eq.f64	%p39, %fd26, 0d3FF0000000000000;
-	or.pred  	%p40, %p39, %p38;
-	selp.f64	%fd39, 0d3FF0000000000000, %fd36, %p40;
-
-BB1_41:
-	st.param.f64	[func_retval0+0], %fd39;
-	ret;
-
-BB1_11:
-	setp.eq.s32	%p13, %r3, 11;
-	@%p13 bra 	BB1_14;
-	bra.uni 	BB1_12;
-
-BB1_14:
-	min.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-
-BB1_12:
-	mov.f64 	%fd39, 0dC08F380000000000;
-	setp.ne.s32	%p14, %r3, 12;
-	@%p14 bra 	BB1_41;
-
-	max.f64 	%fd39, %fd26, %fd27;
-	bra.uni 	BB1_41;
-}
-
-	// .globl	_Z6reduceI5SumOpEvPdS1_jT_d
 .visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
 	.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0,
 	.param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1,
@@ -344,11 +43,11 @@ BB1_12:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB2_5;
+	@%p1 bra 	BB0_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB2_2:
+BB0_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -356,23 +55,23 @@ BB2_2:
 	add.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB2_4;
+	@%p2 bra 	BB0_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	add.f64 	%fd78, %fd78, %fd30;
 
-BB2_4:
+BB0_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB2_2;
+	@%p3 bra 	BB0_2;
 
-BB2_5:
+BB0_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -380,137 +79,137 @@ BB2_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB2_9;
+	@%p4 bra 	BB0_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB2_8;
+	@%p5 bra 	BB0_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB2_8:
+BB0_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB2_9:
+BB0_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB2_13;
+	@%p6 bra 	BB0_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB2_12;
+	@%p7 bra 	BB0_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB2_12:
+BB0_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB2_13:
+BB0_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB2_17;
+	@%p8 bra 	BB0_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB2_16;
+	@%p9 bra 	BB0_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB2_16:
+BB0_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB2_17:
+BB0_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB2_21;
+	@%p10 bra 	BB0_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB2_20;
+	@%p11 bra 	BB0_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB2_20:
+BB0_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB2_21:
+BB0_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB2_34;
+	@%p12 bra 	BB0_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB2_24;
+	@%p13 bra 	BB0_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB2_24:
+BB0_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB2_26;
+	@%p14 bra 	BB0_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB2_26:
+BB0_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB2_28;
+	@%p15 bra 	BB0_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB2_28:
+BB0_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB2_30;
+	@%p16 bra 	BB0_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB2_30:
+BB0_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB2_32;
+	@%p17 bra 	BB0_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB2_32:
+BB0_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB2_34;
+	@%p18 bra 	BB0_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	add.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB2_34:
+BB0_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB2_36;
+	@%p19 bra 	BB0_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB2_36:
+BB0_36:
 	ret;
 }
 
@@ -538,14 +237,14 @@ BB2_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB3_34;
+	@%p1 bra 	BB1_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB3_3;
+	@%p2 bra 	BB1_3;
 
-BB3_2:
+BB1_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -554,9 +253,9 @@ BB3_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB3_2;
+	@%p3 bra 	BB1_2;
 
-BB3_3:
+BB1_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -565,121 +264,121 @@ BB3_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB3_7;
+	@%p4 bra 	BB1_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB3_6;
+	@%p5 bra 	BB1_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	add.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_6:
+BB1_6:
 	bar.sync 	0;
 
-BB3_7:
+BB1_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB3_11;
+	@%p6 bra 	BB1_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB3_10;
+	@%p7 bra 	BB1_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	add.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_10:
+BB1_10:
 	bar.sync 	0;
 
-BB3_11:
+BB1_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB3_15;
+	@%p8 bra 	BB1_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB3_14;
+	@%p9 bra 	BB1_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	add.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_14:
+BB1_14:
 	bar.sync 	0;
 
-BB3_15:
+BB1_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB3_19;
+	@%p10 bra 	BB1_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB3_18;
+	@%p11 bra 	BB1_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	add.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB3_18:
+BB1_18:
 	bar.sync 	0;
 
-BB3_19:
+BB1_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB3_32;
+	@%p12 bra 	BB1_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB3_22;
+	@%p13 bra 	BB1_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	add.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_22:
+BB1_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB3_24;
+	@%p14 bra 	BB1_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	add.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_24:
+BB1_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB3_26;
+	@%p15 bra 	BB1_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	add.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_26:
+BB1_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB3_28;
+	@%p16 bra 	BB1_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	add.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_28:
+BB1_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB3_30;
+	@%p17 bra 	BB1_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	add.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB3_30:
+BB1_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB3_32;
+	@%p18 bra 	BB1_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	add.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB3_32:
+BB1_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB3_34;
+	@%p19 bra 	BB1_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB3_34:
+BB1_34:
 	ret;
 }
 
@@ -710,15 +409,15 @@ BB3_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB4_5;
+	@%p1 bra 	BB2_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB4_4;
+	@%p2 bra 	BB2_4;
 
 	mov.u32 	%r10, %r1;
 
-BB4_3:
+BB2_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -727,14 +426,14 @@ BB4_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB4_3;
+	@%p3 bra 	BB2_3;
 
-BB4_4:
+BB2_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB4_5:
+BB2_5:
 	ret;
 }
 
@@ -763,11 +462,11 @@ BB4_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB5_5;
+	@%p1 bra 	BB3_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB5_2:
+BB3_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -775,23 +474,23 @@ BB5_2:
 	max.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB5_4;
+	@%p2 bra 	BB3_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	max.f64 	%fd78, %fd78, %fd30;
 
-BB5_4:
+BB3_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB5_2;
+	@%p3 bra 	BB3_2;
 
-BB5_5:
+BB3_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -799,137 +498,137 @@ BB5_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB5_9;
+	@%p4 bra 	BB3_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB5_8;
+	@%p5 bra 	BB3_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB5_8:
+BB3_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB5_9:
+BB3_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB5_13;
+	@%p6 bra 	BB3_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB5_12;
+	@%p7 bra 	BB3_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB5_12:
+BB3_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB5_13:
+BB3_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB5_17;
+	@%p8 bra 	BB3_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB5_16;
+	@%p9 bra 	BB3_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB5_16:
+BB3_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB5_17:
+BB3_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB5_21;
+	@%p10 bra 	BB3_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB5_20;
+	@%p11 bra 	BB3_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB5_20:
+BB3_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB5_21:
+BB3_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB5_34;
+	@%p12 bra 	BB3_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB5_24;
+	@%p13 bra 	BB3_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB5_24:
+BB3_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB5_26;
+	@%p14 bra 	BB3_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB5_26:
+BB3_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB5_28;
+	@%p15 bra 	BB3_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB5_28:
+BB3_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB5_30;
+	@%p16 bra 	BB3_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB5_30:
+BB3_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB5_32;
+	@%p17 bra 	BB3_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB5_32:
+BB3_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB5_34;
+	@%p18 bra 	BB3_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	max.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB5_34:
+BB3_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB5_36;
+	@%p19 bra 	BB3_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB5_36:
+BB3_36:
 	ret;
 }
 
@@ -957,14 +656,14 @@ BB5_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB6_34;
+	@%p1 bra 	BB4_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB6_3;
+	@%p2 bra 	BB4_3;
 
-BB6_2:
+BB4_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -973,9 +672,9 @@ BB6_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB6_2;
+	@%p3 bra 	BB4_2;
 
-BB6_3:
+BB4_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -984,121 +683,121 @@ BB6_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB6_7;
+	@%p4 bra 	BB4_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB6_6;
+	@%p5 bra 	BB4_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	max.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_6:
+BB4_6:
 	bar.sync 	0;
 
-BB6_7:
+BB4_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB6_11;
+	@%p6 bra 	BB4_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB6_10;
+	@%p7 bra 	BB4_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	max.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_10:
+BB4_10:
 	bar.sync 	0;
 
-BB6_11:
+BB4_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB6_15;
+	@%p8 bra 	BB4_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB6_14;
+	@%p9 bra 	BB4_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	max.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_14:
+BB4_14:
 	bar.sync 	0;
 
-BB6_15:
+BB4_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB6_19;
+	@%p10 bra 	BB4_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB6_18;
+	@%p11 bra 	BB4_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	max.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB6_18:
+BB4_18:
 	bar.sync 	0;
 
-BB6_19:
+BB4_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB6_32;
+	@%p12 bra 	BB4_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB6_22;
+	@%p13 bra 	BB4_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	max.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_22:
+BB4_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB6_24;
+	@%p14 bra 	BB4_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	max.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_24:
+BB4_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB6_26;
+	@%p15 bra 	BB4_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	max.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_26:
+BB4_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB6_28;
+	@%p16 bra 	BB4_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	max.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_28:
+BB4_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB6_30;
+	@%p17 bra 	BB4_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	max.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB6_30:
+BB4_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB6_32;
+	@%p18 bra 	BB4_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	max.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB6_32:
+BB4_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB6_34;
+	@%p19 bra 	BB4_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB6_34:
+BB4_34:
 	ret;
 }
 
@@ -1129,15 +828,15 @@ BB6_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB7_5;
+	@%p1 bra 	BB5_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB7_4;
+	@%p2 bra 	BB5_4;
 
 	mov.u32 	%r10, %r1;
 
-BB7_3:
+BB5_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -1146,14 +845,14 @@ BB7_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB7_3;
+	@%p3 bra 	BB5_3;
 
-BB7_4:
+BB5_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB7_5:
+BB5_5:
 	ret;
 }
 
@@ -1182,11 +881,11 @@ BB7_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB8_5;
+	@%p1 bra 	BB6_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB8_2:
+BB6_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1194,23 +893,23 @@ BB8_2:
 	min.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB8_4;
+	@%p2 bra 	BB6_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	min.f64 	%fd78, %fd78, %fd30;
 
-BB8_4:
+BB6_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB8_2;
+	@%p3 bra 	BB6_2;
 
-BB8_5:
+BB6_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -1218,137 +917,137 @@ BB8_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB8_9;
+	@%p4 bra 	BB6_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB8_8;
+	@%p5 bra 	BB6_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	min.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB8_8:
+BB6_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB8_9:
+BB6_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB8_13;
+	@%p6 bra 	BB6_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB8_12;
+	@%p7 bra 	BB6_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	min.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB8_12:
+BB6_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB8_13:
+BB6_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB8_17;
+	@%p8 bra 	BB6_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB8_16;
+	@%p9 bra 	BB6_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	min.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB8_16:
+BB6_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB8_17:
+BB6_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB8_21;
+	@%p10 bra 	BB6_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB8_20;
+	@%p11 bra 	BB6_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	min.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB8_20:
+BB6_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB8_21:
+BB6_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB8_34;
+	@%p12 bra 	BB6_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB8_24;
+	@%p13 bra 	BB6_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	min.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB8_24:
+BB6_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB8_26;
+	@%p14 bra 	BB6_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	min.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB8_26:
+BB6_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB8_28;
+	@%p15 bra 	BB6_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	min.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB8_28:
+BB6_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB8_30;
+	@%p16 bra 	BB6_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	min.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB8_30:
+BB6_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB8_32;
+	@%p17 bra 	BB6_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	min.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB8_32:
+BB6_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB8_34;
+	@%p18 bra 	BB6_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	min.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB8_34:
+BB6_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB8_36;
+	@%p19 bra 	BB6_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB8_36:
+BB6_36:
 	ret;
 }
 
@@ -1376,14 +1075,14 @@ BB8_36:
 	ld.param.f64 	%fd40, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r1, %ctaid.x;
 	setp.ge.u32	%p1, %r1, %r7;
-	@%p1 bra 	BB9_34;
+	@%p1 bra 	BB7_34;
 
 	mov.u32 	%r28, %tid.x;
 	mul.lo.s32 	%r3, %r1, %r6;
 	setp.ge.u32	%p2, %r28, %r6;
-	@%p2 bra 	BB9_3;
+	@%p2 bra 	BB7_3;
 
-BB9_2:
+BB7_2:
 	add.s32 	%r8, %r28, %r3;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1392,9 +1091,9 @@ BB9_2:
 	mov.u32 	%r9, %ntid.x;
 	add.s32 	%r28, %r9, %r28;
 	setp.lt.u32	%p3, %r28, %r6;
-	@%p3 bra 	BB9_2;
+	@%p3 bra 	BB7_2;
 
-BB9_3:
+BB7_3:
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
 	mov.u64 	%rd7, sdata;
@@ -1403,121 +1102,121 @@ BB9_3:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB9_7;
+	@%p4 bra 	BB7_7;
 
 	setp.gt.u32	%p5, %r10, 511;
-	@%p5 bra 	BB9_6;
+	@%p5 bra 	BB7_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	min.f64 	%fd40, %fd40, %fd28;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_6:
+BB7_6:
 	bar.sync 	0;
 
-BB9_7:
+BB7_7:
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB9_11;
+	@%p6 bra 	BB7_11;
 
 	setp.gt.u32	%p7, %r10, 255;
-	@%p7 bra 	BB9_10;
+	@%p7 bra 	BB7_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	min.f64 	%fd40, %fd40, %fd29;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_10:
+BB7_10:
 	bar.sync 	0;
 
-BB9_11:
+BB7_11:
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB9_15;
+	@%p8 bra 	BB7_15;
 
 	setp.gt.u32	%p9, %r10, 127;
-	@%p9 bra 	BB9_14;
+	@%p9 bra 	BB7_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	min.f64 	%fd40, %fd40, %fd30;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_14:
+BB7_14:
 	bar.sync 	0;
 
-BB9_15:
+BB7_15:
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB9_19;
+	@%p10 bra 	BB7_19;
 
 	setp.gt.u32	%p11, %r10, 63;
-	@%p11 bra 	BB9_18;
+	@%p11 bra 	BB7_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	min.f64 	%fd40, %fd40, %fd31;
 	st.shared.f64 	[%rd1], %fd40;
 
-BB9_18:
+BB7_18:
 	bar.sync 	0;
 
-BB9_19:
+BB7_19:
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB9_32;
+	@%p12 bra 	BB7_32;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB9_22;
+	@%p13 bra 	BB7_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	min.f64 	%fd40, %fd40, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_22:
+BB7_22:
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB9_24;
+	@%p14 bra 	BB7_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	min.f64 	%fd40, %fd40, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_24:
+BB7_24:
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB9_26;
+	@%p15 bra 	BB7_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	min.f64 	%fd40, %fd40, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_26:
+BB7_26:
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB9_28;
+	@%p16 bra 	BB7_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	min.f64 	%fd40, %fd40, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_28:
+BB7_28:
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB9_30;
+	@%p17 bra 	BB7_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	min.f64 	%fd40, %fd40, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd40;
 
-BB9_30:
+BB7_30:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB9_32;
+	@%p18 bra 	BB7_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	min.f64 	%fd38, %fd40, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB9_32:
+BB7_32:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB9_34;
+	@%p19 bra 	BB7_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	mul.wide.u32 	%rd8, %r1, 8;
 	add.s64 	%rd9, %rd3, %rd8;
 	st.f64 	[%rd9], %fd39;
 
-BB9_34:
+BB7_34:
 	ret;
 }
 
@@ -1548,15 +1247,15 @@ BB9_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r8, %r7, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB10_5;
+	@%p1 bra 	BB8_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB10_4;
+	@%p2 bra 	BB8_4;
 
 	mov.u32 	%r10, %r1;
 
-BB10_3:
+BB8_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd3, %r3, 8;
 	add.s64 	%rd4, %rd1, %rd3;
@@ -1565,14 +1264,14 @@ BB10_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB10_3;
+	@%p3 bra 	BB8_3;
 
-BB10_4:
+BB8_4:
 	mul.wide.u32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd2, %rd5;
 	st.f64 	[%rd6], %fd6;
 
-BB10_5:
+BB8_5:
 	ret;
 }
 
@@ -1601,11 +1300,11 @@ BB10_5:
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB11_5;
+	@%p1 bra 	BB9_5;
 
 	mov.f64 	%fd77, %fd76;
 
-BB11_2:
+BB9_2:
 	mov.f64 	%fd1, %fd77;
 	mul.wide.u32 	%rd4, %r32, 8;
 	add.s64 	%rd5, %rd2, %rd4;
@@ -1613,23 +1312,23 @@ BB11_2:
 	mul.f64 	%fd78, %fd1, %fd29;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB11_4;
+	@%p2 bra 	BB9_4;
 
 	mul.wide.u32 	%rd6, %r3, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	ld.f64 	%fd30, [%rd7];
 	mul.f64 	%fd78, %fd78, %fd30;
 
-BB11_4:
+BB9_4:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB11_2;
+	@%p3 bra 	BB9_2;
 
-BB11_5:
+BB9_5:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd8, %r6, 8;
 	mov.u64 	%rd9, sdata;
@@ -1637,137 +1336,137 @@ BB11_5:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB11_9;
+	@%p4 bra 	BB9_9;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB11_8;
+	@%p5 bra 	BB9_8;
 
 	ld.shared.f64 	%fd31, [%rd1+4096];
 	mul.f64 	%fd75, %fd74, %fd31;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB11_8:
+BB9_8:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB11_9:
+BB9_9:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB11_13;
+	@%p6 bra 	BB9_13;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB11_12;
+	@%p7 bra 	BB9_12;
 
 	ld.shared.f64 	%fd32, [%rd1+2048];
 	mul.f64 	%fd73, %fd72, %fd32;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB11_12:
+BB9_12:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB11_13:
+BB9_13:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB11_17;
+	@%p8 bra 	BB9_17;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB11_16;
+	@%p9 bra 	BB9_16;
 
 	ld.shared.f64 	%fd33, [%rd1+1024];
 	mul.f64 	%fd71, %fd70, %fd33;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB11_16:
+BB9_16:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB11_17:
+BB9_17:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB11_21;
+	@%p10 bra 	BB9_21;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB11_20;
+	@%p11 bra 	BB9_20;
 
 	ld.shared.f64 	%fd34, [%rd1+512];
 	mul.f64 	%fd69, %fd68, %fd34;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB11_20:
+BB9_20:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB11_21:
+BB9_21:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB11_34;
+	@%p12 bra 	BB9_34;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB11_24;
+	@%p13 bra 	BB9_24;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+256];
 	mul.f64 	%fd67, %fd67, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB11_24:
+BB9_24:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB11_26;
+	@%p14 bra 	BB9_26;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+128];
 	mul.f64 	%fd66, %fd66, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB11_26:
+BB9_26:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB11_28;
+	@%p15 bra 	BB9_28;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+64];
 	mul.f64 	%fd65, %fd65, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB11_28:
+BB9_28:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB11_30;
+	@%p16 bra 	BB9_30;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+32];
 	mul.f64 	%fd64, %fd64, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB11_30:
+BB9_30:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB11_32;
+	@%p17 bra 	BB9_32;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+16];
 	mul.f64 	%fd63, %fd63, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB11_32:
+BB9_32:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB11_34;
+	@%p18 bra 	BB9_34;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+8];
 	mul.f64 	%fd41, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd41;
 
-BB11_34:
+BB9_34:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB11_36;
+	@%p19 bra 	BB9_36;
 
 	ld.shared.f64 	%fd42, [sdata];
 	mul.wide.u32 	%rd10, %r7, 8;
 	add.s64 	%rd11, %rd3, %rd10;
 	st.f64 	[%rd11], %fd42;
 
-BB11_36:
+BB9_36:
 	ret;
 }
 
@@ -1796,14 +1495,14 @@ BB11_36:
 	ld.param.f64 	%fd42, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];
 	mov.u32 	%r7, %ctaid.x;
 	setp.ge.u32	%p1, %r7, %r6;
-	@%p1 bra 	BB12_34;
+	@%p1 bra 	BB10_34;
 
 	mov.u32 	%r29, %tid.x;
 	mul.lo.s32 	%r2, %r7, %r5;
 	setp.ge.u32	%p2, %r29, %r5;
-	@%p2 bra 	BB12_3;
+	@%p2 bra 	BB10_3;
 
-BB12_2:
+BB10_2:
 	add.s32 	%r9, %r29, %r2;
 	mul.wide.u32 	%rd5, %r9, 8;
 	add.s64 	%rd6, %rd2, %rd5;
@@ -1812,9 +1511,9 @@ BB12_2:
 	mov.u32 	%r10, %ntid.x;
 	add.s32 	%r29, %r10, %r29;
 	setp.lt.u32	%p3, %r29, %r5;
-	@%p3 bra 	BB12_2;
+	@%p3 bra 	BB10_2;
 
-BB12_3:
+BB10_3:
 	mov.u32 	%r11, %tid.x;
 	mul.wide.u32 	%rd7, %r11, 8;
 	mov.u64 	%rd8, sdata;
@@ -1823,114 +1522,114 @@ BB12_3:
 	bar.sync 	0;
 	mov.u32 	%r12, %ntid.x;
 	setp.lt.u32	%p4, %r12, 1024;
-	@%p4 bra 	BB12_7;
+	@%p4 bra 	BB10_7;
 
 	setp.gt.u32	%p5, %r11, 511;
-	@%p5 bra 	BB12_6;
+	@%p5 bra 	BB10_6;
 
 	ld.shared.f64 	%fd28, [%rd1+4096];
 	add.f64 	%fd42, %fd42, %fd28;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_6:
+BB10_6:
 	bar.sync 	0;
 
-BB12_7:
+BB10_7:
 	setp.lt.u32	%p6, %r12, 512;
-	@%p6 bra 	BB12_11;
+	@%p6 bra 	BB10_11;
 
 	setp.gt.u32	%p7, %r11, 255;
-	@%p7 bra 	BB12_10;
+	@%p7 bra 	BB10_10;
 
 	ld.shared.f64 	%fd29, [%rd1+2048];
 	add.f64 	%fd42, %fd42, %fd29;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_10:
+BB10_10:
 	bar.sync 	0;
 
-BB12_11:
+BB10_11:
 	setp.lt.u32	%p8, %r12, 256;
-	@%p8 bra 	BB12_15;
+	@%p8 bra 	BB10_15;
 
 	setp.gt.u32	%p9, %r11, 127;
-	@%p9 bra 	BB12_14;
+	@%p9 bra 	BB10_14;
 
 	ld.shared.f64 	%fd30, [%rd1+1024];
 	add.f64 	%fd42, %fd42, %fd30;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_14:
+BB10_14:
 	bar.sync 	0;
 
-BB12_15:
+BB10_15:
 	setp.lt.u32	%p10, %r12, 128;
-	@%p10 bra 	BB12_19;
+	@%p10 bra 	BB10_19;
 
 	setp.gt.u32	%p11, %r11, 63;
-	@%p11 bra 	BB12_18;
+	@%p11 bra 	BB10_18;
 
 	ld.shared.f64 	%fd31, [%rd1+512];
 	add.f64 	%fd42, %fd42, %fd31;
 	st.shared.f64 	[%rd1], %fd42;
 
-BB12_18:
+BB10_18:
 	bar.sync 	0;
 
-BB12_19:
+BB10_19:
 	setp.gt.u32	%p12, %r11, 31;
-	@%p12 bra 	BB12_32;
+	@%p12 bra 	BB10_32;
 
 	setp.lt.u32	%p13, %r12, 64;
-	@%p13 bra 	BB12_22;
+	@%p13 bra 	BB10_22;
 
 	ld.volatile.shared.f64 	%fd32, [%rd1+256];
 	add.f64 	%fd42, %fd42, %fd32;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_22:
+BB10_22:
 	setp.lt.u32	%p14, %r12, 32;
-	@%p14 bra 	BB12_24;
+	@%p14 bra 	BB10_24;
 
 	ld.volatile.shared.f64 	%fd33, [%rd1+128];
 	add.f64 	%fd42, %fd42, %fd33;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_24:
+BB10_24:
 	setp.lt.u32	%p15, %r12, 16;
-	@%p15 bra 	BB12_26;
+	@%p15 bra 	BB10_26;
 
 	ld.volatile.shared.f64 	%fd34, [%rd1+64];
 	add.f64 	%fd42, %fd42, %fd34;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_26:
+BB10_26:
 	setp.lt.u32	%p16, %r12, 8;
-	@%p16 bra 	BB12_28;
+	@%p16 bra 	BB10_28;
 
 	ld.volatile.shared.f64 	%fd35, [%rd1+32];
 	add.f64 	%fd42, %fd42, %fd35;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_28:
+BB10_28:
 	setp.lt.u32	%p17, %r12, 4;
-	@%p17 bra 	BB12_30;
+	@%p17 bra 	BB10_30;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+16];
 	add.f64 	%fd42, %fd42, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB12_30:
+BB10_30:
 	setp.lt.u32	%p18, %r12, 2;
-	@%p18 bra 	BB12_32;
+	@%p18 bra 	BB10_32;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+8];
 	add.f64 	%fd38, %fd42, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd38;
 
-BB12_32:
+BB10_32:
 	setp.ne.s32	%p19, %r11, 0;
-	@%p19 bra 	BB12_34;
+	@%p19 bra 	BB10_34;
 
 	ld.shared.f64 	%fd39, [sdata];
 	cvt.rn.f64.s64	%fd40, %rd4;
@@ -1939,7 +1638,7 @@ BB12_32:
 	add.s64 	%rd10, %rd3, %rd9;
 	st.f64 	[%rd10], %fd41;
 
-BB12_34:
+BB10_34:
 	ret;
 }
 
@@ -1971,15 +1670,15 @@ BB12_34:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB13_5;
+	@%p1 bra 	BB11_5;
 
 	mul.lo.s32 	%r2, %r6, %r5;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB13_4;
+	@%p2 bra 	BB11_4;
 
 	mov.u32 	%r10, %r1;
 
-BB13_3:
+BB11_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -1988,16 +1687,16 @@ BB13_3:
 	add.s32 	%r4, %r3, %r6;
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
-	@%p3 bra 	BB13_3;
+	@%p3 bra 	BB11_3;
 
-BB13_4:
+BB11_4:
 	cvt.rn.f64.s64	%fd6, %rd3;
 	div.rn.f64 	%fd7, %fd8, %fd6;
 	mul.wide.u32 	%rd6, %r1, 8;
 	add.s64 	%rd7, %rd2, %rd6;
 	st.f64 	[%rd7], %fd7;
 
-BB13_5:
+BB11_5:
 	ret;
 }
 
@@ -2029,10 +1728,10 @@ BB13_5:
 	setp.gt.s32	%p1, %r2, %r1;
 	setp.lt.s32	%p2, %r3, %r5;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB14_2;
-	bra.uni 	BB14_1;
+	@!%p3 bra 	BB12_2;
+	bra.uni 	BB12_1;
 
-BB14_1:
+BB12_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mad.lo.s32 	%r12, %r1, %r4, %r2;
 	mul.wide.s32 	%rd3, %r12, 8;
@@ -2042,7 +1741,7 @@ BB14_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB14_2:
+BB12_2:
 	ret;
 }
 
@@ -2075,14 +1774,14 @@ BB14_2:
 	mad.lo.s32 	%r1, %r8, %r9, %r11;
 	mul.lo.s32 	%r12, %r3, %r2;
 	setp.ge.s32	%p1, %r1, %r12;
-	@%p1 bra 	BB15_2;
+	@%p1 bra 	BB13_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB15_2:
+BB13_2:
 	ret;
 }
 
@@ -2116,10 +1815,10 @@ BB15_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.lt.s32	%p2, %r11, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB16_2;
-	bra.uni 	BB16_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB16_1:
+BB14_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -2128,7 +1827,7 @@ BB16_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd1;
 
-BB16_2:
+BB14_2:
 	ret;
 }
 
@@ -2161,10 +1860,10 @@ BB16_2:
 	setp.lt.s32	%p1, %r1, %r4;
 	setp.lt.s32	%p2, %r2, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB17_2;
-	bra.uni 	BB17_1;
+	@!%p3 bra 	BB15_2;
+	bra.uni 	BB15_1;
 
-BB17_1:
+BB15_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mad.lo.s32 	%r11, %r1, %r3, %r2;
 	mul.wide.s32 	%rd4, %r11, 8;
@@ -2176,7 +1875,7 @@ BB17_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB17_2:
+BB15_2:
 	ret;
 }
 
@@ -2211,10 +1910,10 @@ BB17_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB18_4;
-	bra.uni 	BB18_1;
+	@!%p3 bra 	BB16_4;
+	bra.uni 	BB16_1;
 
-BB18_1:
+BB16_1:
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r3, %r1, %r4, %r2;
 	mul.wide.s32 	%rd5, %r3, 8;
@@ -2222,18 +1921,18 @@ BB18_1:
 	ld.global.f64 	%fd4, [%rd6];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB18_3;
+	@%p4 bra 	BB16_3;
 
 	cvta.to.global.u64 	%rd7, %rd2;
 	add.s64 	%rd9, %rd7, %rd5;
 	ld.global.f64 	%fd5, [%rd9];
 
-BB18_3:
+BB16_3:
 	cvta.to.global.u64 	%rd10, %rd3;
 	add.s64 	%rd12, %rd10, %rd5;
 	st.global.f64 	[%rd12], %fd5;
 
-BB18_4:
+BB16_4:
 	ret;
 }
 
@@ -2270,10 +1969,10 @@ BB18_4:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.lt.s32	%p2, %r2, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB19_2;
-	bra.uni 	BB19_1;
+	@!%p3 bra 	BB17_2;
+	bra.uni 	BB17_1;
 
-BB19_1:
+BB17_1:
 	cvta.to.global.u64 	%rd4, %rd1;
 	mad.lo.s32 	%r12, %r1, %r3, %r2;
 	mul.wide.s32 	%rd5, %r12, 8;
@@ -2289,7 +1988,7 @@ BB19_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB19_2:
+BB17_2:
 	ret;
 }
 
@@ -2333,10 +2032,10 @@ BB19_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.lt.s32	%p2, %r11, %r3;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB20_6;
-	bra.uni 	BB20_1;
+	@!%p3 bra 	BB18_6;
+	bra.uni 	BB18_1;
 
-BB20_1:
+BB18_1:
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.s32 	%rd5, %r1, 8;
 	add.s64 	%rd6, %rd4, %rd5;
@@ -2346,39 +2045,39 @@ BB20_1:
 	setp.lt.f64	%p4, %fd8, %fd3;
 	cvta.to.global.u64 	%rd7, %rd3;
 	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB20_5;
-	bra.uni 	BB20_2;
+	@%p4 bra 	BB18_5;
+	bra.uni 	BB18_2;
 
-BB20_5:
+BB18_5:
 	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB20_6;
+	bra.uni 	BB18_6;
 
-BB20_2:
+BB18_2:
 	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB20_4;
-	bra.uni 	BB20_3;
+	@%p5 bra 	BB18_4;
+	bra.uni 	BB18_3;
 
-BB20_4:
+BB18_4:
 	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB20_6;
+	bra.uni 	BB18_6;
 
-BB20_3:
+BB18_3:
 	st.global.f64 	[%rd1], %fd6;
 
-BB20_6:
+BB18_6:
 	ret;
 }
 
-	// .globl	binCellOp
-.visible .entry binCellOp(
-	.param .u64 binCellOp_param_0,
-	.param .u64 binCellOp_param_1,
-	.param .u64 binCellOp_param_2,
-	.param .u32 binCellOp_param_3,
-	.param .u32 binCellOp_param_4,
-	.param .u32 binCellOp_param_5,
-	.param .u32 binCellOp_param_6,
-	.param .u32 binCellOp_param_7
+	// .globl	matrix_matrix_cellwise_op
+.visible .entry matrix_matrix_cellwise_op(
+	.param .u64 matrix_matrix_cellwise_op_param_0,
+	.param .u64 matrix_matrix_cellwise_op_param_1,
+	.param .u64 matrix_matrix_cellwise_op_param_2,
+	.param .u32 matrix_matrix_cellwise_op_param_3,
+	.param .u32 matrix_matrix_cellwise_op_param_4,
+	.param .u32 matrix_matrix_cellwise_op_param_5,
+	.param .u32 matrix_matrix_cellwise_op_param_6,
+	.param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
 	.reg .pred 	%p<52>;
@@ -2387,14 +2086,14 @@ BB20_6:
 	.reg .b64 	%rd<15>;
 
 
-	ld.param.u64 	%rd2, [binCellOp_param_0];
-	ld.param.u64 	%rd3, [binCellOp_param_1];
-	ld.param.u64 	%rd4, [binCellOp_param_2];
-	ld.param.u32 	%r14, [binCellOp_param_3];
-	ld.param.u32 	%r10, [binCellOp_param_4];
-	ld.param.u32 	%r11, [binCellOp_param_5];
-	ld.param.u32 	%r12, [binCellOp_param_6];
-	ld.param.u32 	%r13, [binCellOp_param_7];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
+	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
 	mov.u32 	%r15, %ntid.x;
 	mov.u32 	%r16, %ctaid.x;
 	mov.u32 	%r17, %tid.x;
@@ -2406,42 +2105,42 @@ BB20_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.lt.s32	%p3, %r2, %r10;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB21_55;
-	bra.uni 	BB21_1;
+	@!%p4 bra 	BB19_55;
+	bra.uni 	BB19_1;
 
-BB21_1:
+BB19_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
 	mov.u32 	%r54, %r1;
-	@%p5 bra 	BB21_5;
+	@%p5 bra 	BB19_5;
 
 	setp.ne.s32	%p6, %r11, 2;
 	mov.u32 	%r55, %r3;
-	@%p6 bra 	BB21_4;
+	@%p6 bra 	BB19_4;
 
 	mov.u32 	%r55, %r2;
 
-BB21_4:
+BB19_4:
 	mov.u32 	%r49, %r55;
 	mov.u32 	%r4, %r49;
 	mov.u32 	%r54, %r4;
 
-BB21_5:
+BB19_5:
 	mov.u32 	%r5, %r54;
 	setp.eq.s32	%p7, %r12, 1;
 	mov.u32 	%r52, %r1;
-	@%p7 bra 	BB21_9;
+	@%p7 bra 	BB19_9;
 
 	setp.ne.s32	%p8, %r12, 2;
 	mov.u32 	%r53, %r3;
-	@%p8 bra 	BB21_8;
+	@%p8 bra 	BB19_8;
 
 	mov.u32 	%r53, %r2;
 
-BB21_8:
+BB19_8:
 	mov.u32 	%r52, %r53;
 
-BB21_9:
+BB19_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
@@ -2450,49 +2149,49 @@ BB21_9:
 	mul.wide.s32 	%rd9, %r52, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd39, 0dC08F380000000000;
+	mov.f64 	%fd39, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 5;
-	@%p9 bra 	BB21_19;
+	@%p9 bra 	BB19_19;
 
 	setp.gt.s32	%p19, %r13, 2;
-	@%p19 bra 	BB21_15;
+	@%p19 bra 	BB19_15;
 
 	setp.eq.s32	%p23, %r13, 0;
-	@%p23 bra 	BB21_53;
+	@%p23 bra 	BB19_53;
 
 	setp.eq.s32	%p24, %r13, 1;
-	@%p24 bra 	BB21_52;
-	bra.uni 	BB21_13;
+	@%p24 bra 	BB19_52;
+	bra.uni 	BB19_13;
 
-BB21_52:
+BB19_52:
 	sub.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_19:
+BB19_19:
 	setp.gt.s32	%p10, %r13, 8;
-	@%p10 bra 	BB21_24;
+	@%p10 bra 	BB19_24;
 
 	setp.eq.s32	%p16, %r13, 6;
-	@%p16 bra 	BB21_34;
+	@%p16 bra 	BB19_34;
 
 	setp.eq.s32	%p17, %r13, 7;
-	@%p17 bra 	BB21_33;
-	bra.uni 	BB21_22;
+	@%p17 bra 	BB19_33;
+	bra.uni 	BB19_22;
 
-BB21_33:
+BB19_33:
 	setp.gt.f64	%p29, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_15:
+BB19_15:
 	setp.eq.s32	%p20, %r13, 3;
-	@%p20 bra 	BB21_51;
+	@%p20 bra 	BB19_51;
 
 	setp.eq.s32	%p21, %r13, 4;
-	@%p21 bra 	BB21_35;
-	bra.uni 	BB21_17;
+	@%p21 bra 	BB19_35;
+	bra.uni 	BB19_17;
 
-BB21_35:
+BB19_35:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -2507,7 +2206,7 @@ BB21_35:
 	shl.b64 	%rd1, %rd11, %r22;
 	setp.eq.s64	%p32, %rd1, -9223372036854775808;
 	abs.f64 	%fd11, %fd1;
-	// Callseq Start 1
+	// Callseq Start 0
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -2525,13 +2224,13 @@ BB21_35:
 	ld.param.f64	%fd38, [retval0+0];
 	
 	//{
-	}// Callseq End 1
+	}// Callseq End 0
 	setp.lt.s32	%p33, %r8, 0;
 	and.pred  	%p1, %p33, %p32;
-	@!%p1 bra 	BB21_37;
-	bra.uni 	BB21_36;
+	@!%p1 bra 	BB19_37;
+	bra.uni 	BB19_36;
 
-BB21_36:
+BB19_36:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r23}, %fd38;
@@ -2543,111 +2242,111 @@ BB21_36:
 	}
 	mov.b64 	%fd38, {%r25, %r24};
 
-BB21_37:
+BB19_37:
 	mov.f64 	%fd37, %fd38;
 	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
-	@%p34 bra 	BB21_40;
-	bra.uni 	BB21_38;
+	@%p34 bra 	BB19_40;
+	bra.uni 	BB19_38;
 
-BB21_40:
+BB19_40:
 	selp.b32	%r26, %r8, 0, %p32;
 	or.b32  	%r27, %r26, 2146435072;
 	setp.lt.s32	%p38, %r9, 0;
 	selp.b32	%r28, %r27, %r26, %p38;
 	mov.u32 	%r29, 0;
 	mov.b64 	%fd37, {%r29, %r28};
-	bra.uni 	BB21_41;
+	bra.uni 	BB19_41;
 
-BB21_24:
+BB19_24:
 	setp.gt.s32	%p11, %r13, 10;
-	@%p11 bra 	BB21_28;
+	@%p11 bra 	BB19_28;
 
 	setp.eq.s32	%p14, %r13, 9;
-	@%p14 bra 	BB21_32;
-	bra.uni 	BB21_26;
+	@%p14 bra 	BB19_32;
+	bra.uni 	BB19_26;
 
-BB21_32:
+BB19_32:
 	setp.eq.f64	%p27, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_28:
+BB19_28:
 	setp.eq.s32	%p12, %r13, 11;
-	@%p12 bra 	BB21_31;
-	bra.uni 	BB21_29;
+	@%p12 bra 	BB19_31;
+	bra.uni 	BB19_29;
 
-BB21_31:
+BB19_31:
 	min.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_53:
+BB19_53:
 	add.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_13:
+BB19_13:
 	setp.eq.s32	%p25, %r13, 2;
-	@%p25 bra 	BB21_14;
-	bra.uni 	BB21_54;
+	@%p25 bra 	BB19_14;
+	bra.uni 	BB19_54;
 
-BB21_14:
+BB19_14:
 	mul.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_34:
+BB19_34:
 	setp.le.f64	%p30, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_22:
+BB19_22:
 	setp.eq.s32	%p18, %r13, 8;
-	@%p18 bra 	BB21_23;
-	bra.uni 	BB21_54;
+	@%p18 bra 	BB19_23;
+	bra.uni 	BB19_54;
 
-BB21_23:
+BB19_23:
 	setp.ge.f64	%p28, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_51:
+BB19_51:
 	div.rn.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_17:
+BB19_17:
 	setp.eq.s32	%p22, %r13, 5;
-	@%p22 bra 	BB21_18;
-	bra.uni 	BB21_54;
+	@%p22 bra 	BB19_18;
+	bra.uni 	BB19_54;
 
-BB21_18:
+BB19_18:
 	setp.lt.f64	%p31, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_26:
+BB19_26:
 	setp.eq.s32	%p15, %r13, 10;
-	@%p15 bra 	BB21_27;
-	bra.uni 	BB21_54;
+	@%p15 bra 	BB19_27;
+	bra.uni 	BB19_54;
 
-BB21_27:
+BB19_27:
 	setp.neu.f64	%p26, %fd1, %fd2;
 	selp.f64	%fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_29:
+BB19_29:
 	setp.ne.s32	%p13, %r13, 12;
-	@%p13 bra 	BB21_54;
+	@%p13 bra 	BB19_54;
 
 	max.f64 	%fd39, %fd1, %fd2;
-	bra.uni 	BB21_54;
+	bra.uni 	BB19_54;
 
-BB21_38:
+BB19_38:
 	setp.gt.s32	%p35, %r8, -1;
-	@%p35 bra 	BB21_41;
+	@%p35 bra 	BB19_41;
 
 	cvt.rzi.f64.f64	%fd29, %fd2;
 	setp.neu.f64	%p36, %fd29, %fd2;
 	selp.f64	%fd37, 0dFFF8000000000000, %fd37, %p36;
 
-BB21_41:
+BB19_41:
 	mov.f64 	%fd17, %fd37;
 	add.f64 	%fd18, %fd1, %fd2;
 	{
@@ -2657,35 +2356,35 @@ BB21_41:
 	and.b32  	%r31, %r30, 2146435072;
 	setp.ne.s32	%p39, %r31, 2146435072;
 	mov.f64 	%fd36, %fd17;
-	@%p39 bra 	BB21_50;
+	@%p39 bra 	BB19_50;
 
 	setp.gtu.f64	%p40, %fd11, 0d7FF0000000000000;
 	mov.f64 	%fd36, %fd18;
-	@%p40 bra 	BB21_50;
+	@%p40 bra 	BB19_50;
 
 	abs.f64 	%fd30, %fd2;
 	setp.gtu.f64	%p41, %fd30, 0d7FF0000000000000;
 	mov.f64 	%fd35, %fd18;
 	mov.f64 	%fd36, %fd35;
-	@%p41 bra 	BB21_50;
+	@%p41 bra 	BB19_50;
 
 	and.b32  	%r32, %r9, 2147483647;
 	setp.ne.s32	%p42, %r32, 2146435072;
-	@%p42 bra 	BB21_46;
+	@%p42 bra 	BB19_46;
 
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%r33, %temp}, %fd2;
 	}
 	setp.eq.s32	%p43, %r33, 0;
-	@%p43 bra 	BB21_49;
+	@%p43 bra 	BB19_49;
 
-BB21_46:
+BB19_46:
 	and.b32  	%r34, %r8, 2147483647;
 	setp.ne.s32	%p44, %r34, 2146435072;
 	mov.f64 	%fd33, %fd17;
 	mov.f64 	%fd36, %fd33;
-	@%p44 bra 	BB21_50;
+	@%p44 bra 	BB19_50;
 
 	{
 	.reg .b32 %temp; 
@@ -2693,7 +2392,7 @@ BB21_46:
 	}
 	setp.ne.s32	%p45, %r35, 0;
 	mov.f64 	%fd36, %fd17;
-	@%p45 bra 	BB21_50;
+	@%p45 bra 	BB19_50;
 
 	shr.s32 	%r36, %r9, 31;
 	and.b32  	%r37, %r36, -2146435072;
@@ -2702,9 +2401,9 @@ BB21_46:
 	selp.b32	%r40, %r39, %r38, %p1;
 	mov.u32 	%r41, 0;
 	mov.b64 	%fd36, {%r41, %r40};
-	bra.uni 	BB21_50;
+	bra.uni 	BB19_50;
 
-BB21_49:
+BB19_49:
 	setp.gt.f64	%p46, %fd11, 0d3FF0000000000000;
 	selp.b32	%r42, 2146435072, 0, %p46;
 	xor.b32  	%r43, %r42, 2146435072;
@@ -2715,58 +2414,51 @@ BB21_49:
 	mov.u32 	%r46, 0;
 	mov.b64 	%fd36, {%r46, %r45};
 
-BB21_50:
+BB19_50:
 	setp.eq.f64	%p49, %fd2, 0d0000000000000000;
 	setp.eq.f64	%p50, %fd1, 0d3FF0000000000000;
 	or.pred  	%p51, %p50, %p49;
 	selp.f64	%fd39, 0d3FF0000000000000, %fd36, %p51;
 
-BB21_54:
+BB19_54:
 	cvta.to.global.u64 	%rd12, %rd4;
 	mul.wide.s32 	%rd13, %r3, 8;
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd39;
+	bar.sync 	0;
 
-BB21_55:
+BB19_55:
 	ret;
 }
 
-	// .globl	binCellScalarOp
-.visible .entry binCellScalarOp(
-	.param .u64 binCellScalarOp_param_0,
-	.param .f64 binCellScalarOp_param_1,
-	.param .u64 binCellScalarOp_param_2,
-	.param .u32 binCellScalarOp_param_3,
-	.param .u32 binCellScalarOp_param_4,
-	.param .u32 binCellScalarOp_param_5,
-	.param .u32 binCellScalarOp_param_6
+	// .globl	matrix_scalar_op
+.visible .entry matrix_scalar_op(
+	.param .u64 matrix_scalar_op_param_0,
+	.param .f64 matrix_scalar_op_param_1,
+	.param .u64 matrix_scalar_op_param_2,
+	.param .u32 matrix_scalar_op_param_3,
+	.param .u32 matrix_scalar_op_param_4,
+	.param .u32 matrix_scalar_op_param_5
 )
 {
-	.reg .pred 	%p<89>;
-	.reg .b32 	%r<71>;
+	.reg .pred 	%p<91>;
+	.reg .b32 	%r<64>;
 	.reg .f64 	%fd<77>;
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd4, [binCellScalarOp_param_0];
-	ld.param.f64 	%fd52, [binCellScalarOp_param_1];
-	ld.param.u64 	%rd5, [binCellScalarOp_param_2];
-	ld.param.u32 	%r8, [binCellScalarOp_param_3];
-	ld.param.u32 	%r9, [binCellScalarOp_param_4];
-	ld.param.u32 	%r6, [binCellScalarOp_param_5];
-	ld.param.u32 	%r7, [binCellScalarOp_param_6];
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %ntid.x;
-	mov.u32 	%r12, %tid.x;
-	mad.lo.s32 	%r13, %r11, %r10, %r12;
-	mov.u32 	%r14, %ntid.y;
-	mov.u32 	%r15, %ctaid.y;
-	mov.u32 	%r16, %tid.y;
-	mad.lo.s32 	%r17, %r13, %r9, %r16;
-	mad.lo.s32 	%r1, %r14, %r15, %r17;
-	mul.lo.s32 	%r18, %r9, %r8;
-	setp.ge.s32	%p3, %r1, %r18;
-	@%p3 bra 	BB22_92;
+	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
+	ld.param.f64 	%fd52, [matrix_scalar_op_param_1];
+	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
+	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
+	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
+	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
+	mov.u32 	%r9, %ctaid.x;
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %tid.x;
+	mad.lo.s32 	%r1, %r10, %r9, %r11;
+	setp.ge.s32	%p3, %r1, %r8;
+	@%p3 bra 	BB20_94;
 
 	cvta.to.global.u64 	%rd6, %rd5;
 	cvta.to.global.u64 	%rd7, %rd4;
@@ -2775,178 +2467,86 @@ BB21_55:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd1, %rd6, %rd8;
 	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB22_47;
+	@%p4 bra 	BB20_48;
 
-	setp.eq.s32	%p5, %r6, 0;
-	@%p5 bra 	BB22_45;
+	mov.f64 	%fd67, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p5, %r6, 5;
+	@%p5 bra 	BB20_12;
 
-	mov.f64 	%fd67, 0dC08F380000000000;
-	setp.gt.s32	%p6, %r6, 6;
-	@%p6 bra 	BB22_13;
+	setp.gt.s32	%p15, %r6, 2;
+	@%p15 bra 	BB20_8;
 
-	setp.gt.s32	%p14, %r6, 3;
-	@%p14 bra 	BB22_9;
+	setp.eq.s32	%p19, %r6, 0;
+	@%p19 bra 	BB20_46;
 
-	setp.eq.s32	%p18, %r6, 1;
-	@%p18 bra 	BB22_44;
+	setp.eq.s32	%p20, %r6, 1;
+	@%p20 bra 	BB20_45;
+	bra.uni 	BB20_6;
 
-	setp.eq.s32	%p19, %r6, 2;
-	@%p19 bra 	BB22_43;
-	bra.uni 	BB22_7;
+BB20_45:
+	sub.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
 
-BB22_43:
-	mul.f64 	%fd67, %fd1, %fd52;
-	bra.uni 	BB22_46;
-
-BB22_47:
-	setp.eq.s32	%p47, %r6, 0;
-	@%p47 bra 	BB22_90;
-
-	mov.f64 	%fd76, 0dC08F380000000000;
-	setp.gt.s32	%p48, %r6, 6;
-	@%p48 bra 	BB22_58;
-
-	setp.gt.s32	%p56, %r6, 3;
-	@%p56 bra 	BB22_54;
-
-	setp.eq.s32	%p60, %r6, 1;
-	@%p60 bra 	BB22_89;
-
-	setp.eq.s32	%p61, %r6, 2;
-	@%p61 bra 	BB22_88;
-	bra.uni 	BB22_52;
-
-BB22_88:
-	mul.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_45:
-	add.f64 	%fd67, %fd1, %fd52;
-
-BB22_46:
-	st.global.f64 	[%rd1], %fd67;
-	bra.uni 	BB22_92;
-
-BB22_13:
-	setp.gt.s32	%p7, %r6, 9;
-	@%p7 bra 	BB22_18;
-
-	setp.eq.s32	%p11, %r6, 7;
-	@%p11 bra 	BB22_25;
-
-	setp.eq.s32	%p12, %r6, 8;
-	@%p12 bra 	BB22_24;
-	bra.uni 	BB22_16;
-
-BB22_24:
-	setp.le.f64	%p23, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
-	bra.uni 	BB22_46;
-
-BB22_90:
-	add.f64 	%fd76, %fd1, %fd52;
-
-BB22_91:
-	st.global.f64 	[%rd1], %fd76;
-
-BB22_92:
-	ret;
-
-BB22_58:
-	setp.gt.s32	%p49, %r6, 9;
-	@%p49 bra 	BB22_63;
+BB20_48:
+	mov.f64 	%fd76, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p48, %r6, 5;
+	@%p48 bra 	BB20_58;
 
-	setp.eq.s32	%p53, %r6, 7;
-	@%p53 bra 	BB22_70;
+	setp.gt.s32	%p58, %r6, 2;
+	@%p58 bra 	BB20_54;
 
-	setp.eq.s32	%p54, %r6, 8;
-	@%p54 bra 	BB22_69;
-	bra.uni 	BB22_61;
+	setp.eq.s32	%p62, %r6, 0;
+	@%p62 bra 	BB20_92;
 
-BB22_69:
-	setp.ge.f64	%p65, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
-	bra.uni 	BB22_91;
+	setp.eq.s32	%p63, %r6, 1;
+	@%p63 bra 	BB20_91;
+	bra.uni 	BB20_52;
 
-BB22_9:
-	setp.eq.s32	%p15, %r6, 4;
-	@%p15 bra 	BB22_27;
+BB20_91:
+	sub.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
 
-	setp.eq.s32	%p16, %r6, 5;
-	@%p16 bra 	BB22_26;
-	bra.uni 	BB22_11;
+BB20_12:
+	setp.gt.s32	%p6, %r6, 8;
+	@%p6 bra 	BB20_17;
 
-BB22_26:
-	setp.gt.f64	%p26, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
-	bra.uni 	BB22_46;
+	setp.eq.s32	%p12, %r6, 6;
+	@%p12 bra 	BB20_27;
 
-BB22_18:
-	setp.eq.s32	%p8, %r6, 10;
-	@%p8 bra 	BB22_23;
+	setp.eq.s32	%p13, %r6, 7;
+	@%p13 bra 	BB20_26;
+	bra.uni 	BB20_15;
 
-	setp.eq.s32	%p9, %r6, 11;
-	@%p9 bra 	BB22_22;
-	bra.uni 	BB22_20;
+BB20_26:
+	setp.lt.f64	%p25, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+	bra.uni 	BB20_47;
 
-BB22_22:
-	min.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
+BB20_58:
+	setp.gt.s32	%p49, %r6, 8;
+	@%p49 bra 	BB20_63;
 
-BB22_54:
-	setp.eq.s32	%p57, %r6, 4;
-	@%p57 bra 	BB22_72;
+	setp.eq.s32	%p55, %r6, 6;
+	@%p55 bra 	BB20_73;
 
-	setp.eq.s32	%p58, %r6, 5;
-	@%p58 bra 	BB22_71;
-	bra.uni 	BB22_56;
+	setp.eq.s32	%p56, %r6, 7;
+	@%p56 bra 	BB20_72;
+	bra.uni 	BB20_61;
 
-BB22_71:
-	setp.lt.f64	%p68, %fd1, %fd52;
+BB20_72:
+	setp.gt.f64	%p68, %fd1, %fd52;
 	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
 
-BB22_63:
-	setp.eq.s32	%p50, %r6, 10;
-	@%p50 bra 	BB22_68;
+BB20_8:
+	setp.eq.s32	%p16, %r6, 3;
+	@%p16 bra 	BB20_44;
 
-	setp.eq.s32	%p51, %r6, 11;
-	@%p51 bra 	BB22_67;
-	bra.uni 	BB22_65;
-
-BB22_67:
-	min.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_44:
-	sub.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_7:
-	setp.eq.s32	%p20, %r6, 3;
-	@%p20 bra 	BB22_8;
-	bra.uni 	BB22_46;
+	setp.eq.s32	%p17, %r6, 4;
+	@%p17 bra 	BB20_28;
+	bra.uni 	BB20_10;
 
-BB22_8:
-	div.rn.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_25:
-	setp.lt.f64	%p24, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
-	bra.uni 	BB22_46;
-
-BB22_16:
-	setp.eq.s32	%p13, %r6, 9;
-	@%p13 bra 	BB22_17;
-	bra.uni 	BB22_46;
-
-BB22_17:
-	setp.eq.f64	%p22, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
-	bra.uni 	BB22_46;
-
-BB22_27:
+BB20_28:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r2}, %fd52;
@@ -2955,13 +2555,13 @@ BB22_27:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r3}, %fd1;
 	}
-	bfe.u32 	%r19, %r3, 20, 11;
-	add.s32 	%r20, %r19, -1012;
+	bfe.u32 	%r12, %r3, 20, 11;
+	add.s32 	%r13, %r12, -1012;
 	mov.b64 	 %rd10, %fd1;
-	shl.b64 	%rd2, %rd10, %r20;
-	setp.eq.s64	%p27, %rd2, -9223372036854775808;
+	shl.b64 	%rd2, %rd10, %r13;
+	setp.eq.s64	%p28, %rd2, -9223372036854775808;
 	abs.f64 	%fd10, %fd52;
-	// Callseq Start 2
+	// Callseq Start 1
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -2979,90 +2579,61 @@ BB22_27:
 	ld.param.f64	%fd66, [retval0+0];
 	
 	//{
-	}// Callseq End 2
-	setp.lt.s32	%p28, %r2, 0;
-	and.pred  	%p1, %p28, %p27;
-	@!%p1 bra 	BB22_29;
-	bra.uni 	BB22_28;
+	}// Callseq End 1
+	setp.lt.s32	%p29, %r2, 0;
+	and.pred  	%p1, %p29, %p28;
+	@!%p1 bra 	BB20_30;
+	bra.uni 	BB20_29;
 
-BB22_28:
+BB20_29:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r21}, %fd66;
+	mov.b64 	{%temp, %r14}, %fd66;
 	}
-	xor.b32  	%r22, %r21, -2147483648;
+	xor.b32  	%r15, %r14, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r23, %temp}, %fd66;
+	mov.b64 	{%r16, %temp}, %fd66;
 	}
-	mov.b64 	%fd66, {%r23, %r22};
+	mov.b64 	%fd66, {%r16, %r15};
 
-BB22_29:
+BB20_30:
 	mov.f64 	%fd65, %fd66;
-	setp.eq.f64	%p29, %fd52, 0d0000000000000000;
-	@%p29 bra 	BB22_32;
-	bra.uni 	BB22_30;
-
-BB22_32:
-	selp.b32	%r24, %r2, 0, %p27;
-	or.b32  	%r25, %r24, 2146435072;
-	setp.lt.s32	%p33, %r3, 0;
-	selp.b32	%r26, %r25, %r24, %p33;
-	mov.u32 	%r27, 0;
-	mov.b64 	%fd65, {%r27, %r26};
-	bra.uni 	BB22_33;
-
-BB22_11:
-	setp.eq.s32	%p17, %r6, 6;
-	@%p17 bra 	BB22_12;
-	bra.uni 	BB22_46;
-
-BB22_12:
-	setp.ge.f64	%p25, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
-	bra.uni 	BB22_46;
-
-BB22_23:
-	setp.neu.f64	%p21, %fd1, %fd52;
-	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
-	bra.uni 	BB22_46;
-
-BB22_20:
-	setp.ne.s32	%p10, %r6, 12;
-	@%p10 bra 	BB22_46;
-
-	max.f64 	%fd67, %fd52, %fd1;
-	bra.uni 	BB22_46;
-
-BB22_89:
-	sub.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_52:
-	setp.eq.s32	%p62, %r6, 3;
-	@%p62 bra 	BB22_53;
-	bra.uni 	BB22_91;
-
-BB22_53:
-	div.rn.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
-
-BB22_70:
-	setp.gt.f64	%p66, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
-	bra.uni 	BB22_91;
+	setp.eq.f64	%p30, %fd52, 0d0000000000000000;
+	@%p30 bra 	BB20_33;
+	bra.uni 	BB20_31;
+
+BB20_33:
+	selp.b32	%r17, %r2, 0, %p28;
+	or.b32  	%r18, %r17, 2146435072;
+	setp.lt.s32	%p34, %r3, 0;
+	selp.b32	%r19, %r18, %r17, %p34;
+	mov.u32 	%r20, 0;
+	mov.b64 	%fd65, {%r20, %r19};
+	bra.uni 	BB20_34;
+
+BB20_17:
+	setp.gt.s32	%p7, %r6, 10;
+	@%p7 bra 	BB20_21;
+
+	setp.eq.s32	%p10, %r6, 9;
+	@%p10 bra 	BB20_25;
+	bra.uni 	BB20_19;
+
+BB20_25:
+	setp.eq.f64	%p23, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+	bra.uni 	BB20_47;
 
-BB22_61:
-	setp.eq.s32	%p55, %r6, 9;
-	@%p55 bra 	BB22_62;
-	bra.uni 	BB22_91;
+BB20_54:
+	setp.eq.s32	%p59, %r6, 3;
+	@%p59 bra 	BB20_90;
 
-BB22_62:
-	setp.eq.f64	%p64, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
-	bra.uni 	BB22_91;
+	setp.eq.s32	%p60, %r6, 4;
+	@%p60 bra 	BB20_74;
+	bra.uni 	BB20_56;
 
-BB22_72:
+BB20_74:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -3071,13 +2642,13 @@ BB22_72:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd52;
 	}
-	bfe.u32 	%r45, %r5, 20, 11;
-	add.s32 	%r46, %r45, -1012;
+	bfe.u32 	%r38, %r5, 20, 11;
+	add.s32 	%r39, %r38, -1012;
 	mov.b64 	 %rd11, %fd52;
-	shl.b64 	%rd3, %rd11, %r46;
-	setp.eq.s64	%p69, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd11, %r39;
+	setp.eq.s64	%p71, %rd3, -9223372036854775808;
 	abs.f64 	%fd35, %fd1;
-	// Callseq Start 3
+	// Callseq Start 2
 	{
 	.reg .b32 temp_param_reg;
 	// <end>}
@@ -3095,226 +2666,362 @@ BB22_72:
 	ld.param.f64	%fd75, [retval0+0];
 	
 	//{
-	}// Callseq End 3
-	setp.lt.s32	%p70, %r4, 0;
-	and.pred  	%p2, %p70, %p69;
-	@!%p2 bra 	BB22_74;
-	bra.uni 	BB22_73;
+	}// Callseq End 2
+	setp.lt.s32	%p72, %r4, 0;
+	and.pred  	%p2, %p72, %p71;
+	@!%p2 bra 	BB20_76;
+	bra.uni 	BB20_75;
 
-BB22_73:
+BB20_75:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r47}, %fd75;
+	mov.b64 	{%temp, %r40}, %fd75;
 	}
-	xor.b32  	%r48, %r47, -2147483648;
+	xor.b32  	%r41, %r40, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r49, %temp}, %fd75;
+	mov.b64 	{%r42, %temp}, %fd75;
 	}
-	mov.b64 	%fd75, {%r49, %r48};
+	mov.b64 	%fd75, {%r42, %r41};
 
-BB22_74:
+BB20_76:
 	mov.f64 	%fd74, %fd75;
-	setp.eq.f64	%p71, %fd1, 0d0000000000000000;
-	@%p71 bra 	BB22_77;
-	bra.uni 	BB22_75;
-
-BB22_77:
-	selp.b32	%r50, %r4, 0, %p69;
-	or.b32  	%r51, %r50, 2146435072;
-	setp.lt.s32	%p75, %r5, 0;
-	selp.b32	%r52, %r51, %r50, %p75;
-	mov.u32 	%r53, 0;
-	mov.b64 	%fd74, {%r53, %r52};
-	bra.uni 	BB22_78;
-
-BB22_56:
-	setp.eq.s32	%p59, %r6, 6;
-	@%p59 bra 	BB22_57;
-	bra.uni 	BB22_91;
-
-BB22_57:
-	setp.le.f64	%p67, %fd1, %fd52;
+	setp.eq.f64	%p73, %fd1, 0d0000000000000000;
+	@%p73 bra 	BB20_79;
+	bra.uni 	BB20_77;
+
+BB20_79:
+	selp.b32	%r43, %r4, 0, %p71;
+	or.b32  	%r44, %r43, 2146435072;
+	setp.lt.s32	%p77, %r5, 0;
+	selp.b32	%r45, %r44, %r43, %p77;
+	mov.u32 	%r46, 0;
+	mov.b64 	%fd74, {%r46, %r45};
+	bra.uni 	BB20_80;
+
+BB20_63:
+	setp.gt.s32	%p50, %r6, 10;
+	@%p50 bra 	BB20_67;
+
+	setp.eq.s32	%p53, %r6, 9;
+	@%p53 bra 	BB20_71;
+	bra.uni 	BB20_65;
+
+BB20_71:
+	setp.eq.f64	%p66, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+	bra.uni 	BB20_93;
+
+BB20_21:
+	setp.eq.s32	%p8, %r6, 11;
+	@%p8 bra 	BB20_24;
+	bra.uni 	BB20_22;
+
+BB20_24:
+	min.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_46:
+	add.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB20_47;
+
+BB20_6:
+	setp.eq.s32	%p21, %r6, 2;
+	@%p21 bra 	BB20_7;
+	bra.uni 	BB20_47;
+
+BB20_7:
+	mul.f64 	%fd67, %fd1, %fd52;
+	bra.uni 	BB20_47;
+
+BB20_27:
+	setp.ge.f64	%p26, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB20_47;
+
+BB20_15:
+	setp.eq.s32	%p14, %r6, 8;
+	@%p14 bra 	BB20_16;
+	bra.uni 	BB20_47;
+
+BB20_16:
+	setp.le.f64	%p24, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+	bra.uni 	BB20_47;
+
+BB20_44:
+	div.rn.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_10:
+	setp.eq.s32	%p18, %r6, 5;
+	@%p18 bra 	BB20_11;
+	bra.uni 	BB20_47;
+
+BB20_11:
+	setp.gt.f64	%p27, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
+	bra.uni 	BB20_47;
+
+BB20_67:
+	setp.eq.s32	%p51, %r6, 11;
+	@%p51 bra 	BB20_70;
+	bra.uni 	BB20_68;
+
+BB20_70:
+	min.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_19:
+	setp.eq.s32	%p11, %r6, 10;
+	@%p11 bra 	BB20_20;
+	bra.uni 	BB20_47;
+
+BB20_20:
+	setp.neu.f64	%p22, %fd1, %fd52;
+	selp.f64	%fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+	bra.uni 	BB20_47;
+
+BB20_22:
+	setp.ne.s32	%p9, %r6, 12;
+	@%p9 bra 	BB20_47;
+
+	max.f64 	%fd67, %fd52, %fd1;
+	bra.uni 	BB20_47;
+
+BB20_92:
+	add.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_52:
+	setp.eq.s32	%p64, %r6, 2;
+	@%p64 bra 	BB20_53;
+	bra.uni 	BB20_93;
+
+BB20_53:
+	mul.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_73:
+	setp.le.f64	%p69, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
+	bra.uni 	BB20_93;
+
+BB20_61:
+	setp.eq.s32	%p57, %r6, 8;
+	@%p57 bra 	BB20_62;
+	bra.uni 	BB20_93;
+
+BB20_62:
+	setp.ge.f64	%p67, %fd1, %fd52;
 	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
+
+BB20_90:
+	div.rn.f64 	%fd76, %fd1, %fd52;
+	bra.uni 	BB20_93;
+
+BB20_56:
+	setp.eq.s32	%p61, %r6, 5;
+	@%p61 bra 	BB20_57;
+	bra.uni 	BB20_93;
 
-BB22_68:
-	setp.neu.f64	%p63, %fd1, %fd52;
-	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
-	bra.uni 	BB22_91;
+BB20_57:
+	setp.lt.f64	%p70, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
+	bra.uni 	BB20_93;
 
-BB22_65:
+BB20_65:
+	setp.eq.s32	%p54, %r6, 10;
+	@%p54 bra 	BB20_66;
+	bra.uni 	BB20_93;
+
+BB20_66:
+	setp.neu.f64	%p65, %fd1, %fd52;
+	selp.f64	%fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+	bra.uni 	BB20_93;
+
+BB20_68:
 	setp.ne.s32	%p52, %r6, 12;
-	@%p52 bra 	BB22_91;
+	@%p52 bra 	BB20_93;
 
 	max.f64 	%fd76, %fd1, %fd52;
-	bra.uni 	BB22_91;
+	bra.uni 	BB20_93;
 
-BB22_30:
-	setp.gt.s32	%p30, %r2, -1;
-	@%p30 bra 	BB22_33;
+BB20_31:
+	setp.gt.s32	%p31, %r2, -1;
+	@%p31 bra 	BB20_34;
 
 	cvt.rzi.f64.f64	%fd54, %fd1;
-	setp.neu.f64	%p31, %fd54, %fd1;
-	selp.f64	%fd65, 0dFFF8000000000000, %fd65, %p31;
+	setp.neu.f64	%p32, %fd54, %fd1;
+	selp.f64	%fd65, 0dFFF8000000000000, %fd65, %p32;
 
-BB22_33:
+BB20_34:
 	mov.f64 	%fd16, %fd65;
 	add.f64 	%fd17, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r28}, %fd17;
+	mov.b64 	{%temp, %r21}, %fd17;
 	}
-	and.b32  	%r29, %r28, 2146435072;
-	setp.ne.s32	%p34, %r29, 2146435072;
+	and.b32  	%r22, %r21, 2146435072;
+	setp.ne.s32	%p35, %r22, 2146435072;
 	mov.f64 	%fd64, %fd16;
-	@%p34 bra 	BB22_42;
+	@%p35 bra 	BB20_43;
 
-	setp.gtu.f64	%p35, %fd10, 0d7FF0000000000000;
+	setp.gtu.f64	%p36, %fd10, 0d7FF0000000000000;
 	mov.f64 	%fd64, %fd17;
-	@%p35 bra 	BB22_42;
+	@%p36 bra 	BB20_43;
 
 	abs.f64 	%fd55, %fd1;
-	setp.gtu.f64	%p36, %fd55, 0d7FF0000000000000;
+	setp.gtu.f64	%p37, %fd55, 0d7FF0000000000000;
 	mov.f64 	%fd63, %fd17;
 	mov.f64 	%fd64, %fd63;
-	@%p36 bra 	BB22_42;
+	@%p37 bra 	BB20_43;
 
-	and.b32  	%r30, %r3, 2147483647;
-	setp.ne.s32	%p37, %r30, 2146435072;
-	@%p37 bra 	BB22_38;
+	and.b32  	%r23, %r3, 2147483647;
+	setp.ne.s32	%p38, %r23, 2146435072;
+	@%p38 bra 	BB20_39;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r31, %temp}, %fd1;
+	mov.b64 	{%r24, %temp}, %fd1;
 	}
-	setp.eq.s32	%p38, %r31, 0;
-	@%p38 bra 	BB22_41;
+	setp.eq.s32	%p39, %r24, 0;
+	@%p39 bra 	BB20_42;
 
-BB22_38:
-	and.b32  	%r32, %r2, 2147483647;
-	setp.ne.s32	%p39, %r32, 2146435072;
+BB20_39:
+	and.b32  	%r25, %r2, 2147483647;
+	setp.ne.s32	%p40, %r25, 2146435072;
 	mov.f64 	%fd61, %fd16;
 	mov.f64 	%fd64, %fd61;
-	@%p39 bra 	BB22_42;
+	@%p40 bra 	BB20_43;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r33, %temp}, %fd52;
+	mov.b64 	{%r26, %temp}, %fd52;
 	}
-	setp.ne.s32	%p40, %r33, 0;
+	setp.ne.s32	%p41, %r26, 0;
 	mov.f64 	%fd64, %fd16;
-	@%p40 bra 	BB22_42;
+	@%p41 bra 	BB20_43;
 
-	shr.s32 	%r34, %r3, 31;
-	and.b32  	%r35, %r34, -2146435072;
-	add.s32 	%r36, %r35, 2146435072;
-	or.b32  	%r37, %r36, -2147483648;
-	selp.b32	%r38, %r37, %r36, %p1;
-	mov.u32 	%r39, 0;
-	mov.b64 	%fd64, {%r39, %r38};
-	bra.uni 	BB22_42;
+	shr.s32 	%r27, %r3, 31;
+	and.b32  	%r28, %r27, -2146435072;
+	add.s32 	%r29, %r28, 2146435072;
+	or.b32  	%r30, %r29, -2147483648;
+	selp.b32	%r31, %r30, %r29, %p1;
+	mov.u32 	%r32, 0;
+	mov.b64 	%fd64, {%r32, %r31};
+	bra.uni 	BB20_43;
 
-BB22_75:
-	setp.gt.s32	%p72, %r4, -1;
-	@%p72 bra 	BB22_78;
+BB20_77:
+	setp.gt.s32	%p74, %r4, -1;
+	@%p74 bra 	BB20_80;
 
 	cvt.rzi.f64.f64	%fd57, %fd52;
-	setp.neu.f64	%p73, %fd57, %fd52;
-	selp.f64	%fd74, 0dFFF8000000000000, %fd74, %p73;
+	setp.neu.f64	%p75, %fd57, %fd52;
+	selp.f64	%fd74, 0dFFF8000000000000, %fd74, %p75;
 
-BB22_78:
+BB20_80:
 	mov.f64 	%fd41, %fd74;
 	add.f64 	%fd42, %fd1, %fd52;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r54}, %fd42;
+	mov.b64 	{%temp, %r47}, %fd42;
 	}
-	and.b32  	%r55, %r54, 2146435072;
-	setp.ne.s32	%p76, %r55, 2146435072;
+	and.b32  	%r48, %r47, 2146435072;
+	setp.ne.s32	%p78, %r48, 2146435072;
 	mov.f64 	%fd73, %fd41;
-	@%p76 bra 	BB22_87;
+	@%p78 bra 	BB20_89;
 
-	setp.gtu.f64	%p77, %fd35, 0d7FF0000000000000;
+	setp.gtu.f64	%p79, %fd35, 0d7FF0000000000000;
 	mov.f64 	%fd73, %fd42;
-	@%p77 bra 	BB22_87;
+	@%p79 bra 	BB20_89;
 
 	abs.f64 	%fd58, %fd52;
-	setp.gtu.f64	%p78, %fd58, 0d7FF0000000000000;
+	setp.gtu.f64	%p80, %fd58, 0d7FF0000000000000;
 	mov.f64 	%fd72, %fd42;
 	mov.f64 	%fd73, %fd72;
-	@%p78 bra 	BB22_87;
+	@%p80 bra 	BB20_89;
 
-	and.b32  	%r56, %r5, 2147483647;
-	setp.ne.s32	%p79, %r56, 2146435072;
-	@%p79 bra 	BB22_83;
+	and.b32  	%r49, %r5, 2147483647;
+	setp.ne.s32	%p81, %r49, 2146435072;
+	@%p81 bra 	BB20_85;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r57, %temp}, %fd52;
+	mov.b64 	{%r50, %temp}, %fd52;
 	}
-	setp.eq.s32	%p80, %r57, 0;
-	@%p80 bra 	BB22_86;
+	setp.eq.s32	%p82, %r50, 0;
+	@%p82 bra 	BB20_88;
 
-BB22_83:
-	and.b32  	%r58, %r4, 2147483647;
-	setp.ne.s32	%p81, %r58, 2146435072;
+BB20_85:
+	and.b32  	%r51, %r4, 2147483647;
+	setp.ne.s32	%p83, %r51, 2146435072;
 	mov.f64 	%fd70, %fd41;
 	mov.f64 	%fd73, %fd70;
-	@%p81 bra 	BB22_87;
+	@%p83 bra 	BB20_89;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r59, %temp}, %fd1;
+	mov.b64 	{%r52, %temp}, %fd1;
 	}
-	setp.ne.s32	%p82, %r59, 0;
+	setp.ne.s32	%p84, %r52, 0;
 	mov.f64 	%fd73, %fd41;
-	@%p82 bra 	BB22_87;
-
-	shr.s32 	%r60, %r5, 31;
-	and.b32  	%r61, %r60, -2146435072;
-	add.s32 	%r62, %r61, 2146435072;
-	or.b32  	%r63, %r62, -2147483648;
-	selp.b32	%r64, %r63, %r62, %p2;
-	mov.u32 	%r65, 0;
-	mov.b64 	%fd73, {%r65, %r64};
-	bra.uni 	BB22_87;
-
-BB22_41:
-	setp.gt.f64	%p41, %fd10, 0d3FF0000000000000;
-	selp.b32	%r40, 2146435072, 0, %p41;
-	xor.b32  	%r41, %r40, 2146435072;
-	setp.lt.s32	%p42, %r3, 0;
-	selp.b32	%r42, %r41, %r40, %p42;
-	setp.eq.f64	%p43, %fd52, 0dBFF0000000000000;
-	selp.b32	%r43, 1072693248, %r42, %p43;
-	mov.u32 	%r44, 0;
-	mov.b64 	%fd64, {%r44, %r43};
-
-BB22_42:
-	setp.eq.f64	%p44, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p45, %fd52, 0d3FF0000000000000;
-	or.pred  	%p46, %p45, %p44;
-	selp.f64	%fd67, 0d3FF0000000000000, %fd64, %p46;
-	bra.uni 	BB22_46;
-
-BB22_86:
-	setp.gt.f64	%p83, %fd35, 0d3FF0000000000000;
-	selp.b32	%r66, 2146435072, 0, %p83;
-	xor.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p84, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p84;
-	setp.eq.f64	%p85, %fd1, 0dBFF0000000000000;
-	selp.b32	%r69, 1072693248, %r68, %p85;
-	mov.u32 	%r70, 0;
-	mov.b64 	%fd73, {%r70, %r69};
-
-BB22_87:
-	setp.eq.f64	%p86, %fd52, 0d0000000000000000;
-	setp.eq.f64	%p87, %fd1, 0d3FF0000000000000;
-	or.pred  	%p88, %p87, %p86;
-	selp.f64	%fd76, 0d3FF0000000000000, %fd73, %p88;
-	bra.uni 	BB22_91;
+	@%p84 bra 	BB20_89;
+
+	shr.s32 	%r53, %r5, 31;
+	and.b32  	%r54, %r53, -2146435072;
+	add.s32 	%r55, %r54, 2146435072;
+	or.b32  	%r56, %r55, -2147483648;
+	selp.b32	%r57, %r56, %r55, %p2;
+	mov.u32 	%r58, 0;
+	mov.b64 	%fd73, {%r58, %r57};
+	bra.uni 	BB20_89;
+
+BB20_42:
+	setp.gt.f64	%p42, %fd10, 0d3FF0000000000000;
+	selp.b32	%r33, 2146435072, 0, %p42;
+	xor.b32  	%r34, %r33, 2146435072;
+	setp.lt.s32	%p43, %r3, 0;
+	selp.b32	%r35, %r34, %r33, %p43;
+	setp.eq.f64	%p44, %fd52, 0dBFF0000000000000;
+	selp.b32	%r36, 1072693248, %r35, %p44;
+	mov.u32 	%r37, 0;
+	mov.b64 	%fd64, {%r37, %r36};
+
+BB20_43:
+	setp.eq.f64	%p45, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p46, %fd52, 0d3FF0000000000000;
+	or.pred  	%p47, %p46, %p45;
+	selp.f64	%fd67, 0d3FF0000000000000, %fd64, %p47;
+
+BB20_47:
+	st.global.f64 	[%rd1], %fd67;
+	bra.uni 	BB20_94;
+
+BB20_88:
+	setp.gt.f64	%p85, %fd35, 0d3FF0000000000000;
+	selp.b32	%r59, 2146435072, 0, %p85;
+	xor.b32  	%r60, %r59, 2146435072;
+	setp.lt.s32	%p86, %r5, 0;
+	selp.b32	%r61, %r60, %r59, %p86;
+	setp.eq.f64	%p87, %fd1, 0dBFF0000000000000;
+	selp.b32	%r62, 1072693248, %r61, %p87;
+	mov.u32 	%r63, 0;
+	mov.b64 	%fd73, {%r63, %r62};
+
+BB20_89:
+	setp.eq.f64	%p88, %fd52, 0d0000000000000000;
+	setp.eq.f64	%p89, %fd1, 0d3FF0000000000000;
+	or.pred  	%p90, %p89, %p88;
+	selp.f64	%fd76, 0d3FF0000000000000, %fd73, %p90;
+
+BB20_93:
+	st.global.f64 	[%rd1], %fd76;
+
+BB20_94:
+	bar.sync 	0;
+	ret;
 }
 
 	// .globl	fill
@@ -3338,14 +3045,14 @@ BB22_87:
 	mov.u32 	%r5, %tid.x;
 	mad.lo.s32 	%r1, %r4, %r3, %r5;
 	setp.ge.s32	%p1, %r1, %r2;
-	@%p1 bra 	BB23_2;
+	@%p1 bra 	BB21_2;
 
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
 	st.global.f64 	[%rd4], %fd1;
 
-BB23_2:
+BB21_2:
 	ret;
 }
 
@@ -3373,9 +3080,9 @@ BB23_2:
 	mov.f64 	%fd76, 0d0000000000000000;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB24_4;
+	@%p1 bra 	BB22_4;
 
-BB24_1:
+BB22_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3384,23 +3091,23 @@ BB24_1:
 	add.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB24_3;
+	@%p2 bra 	BB22_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	add.f64 	%fd78, %fd78, %fd31;
 
-BB24_3:
+BB22_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB24_1;
+	@%p3 bra 	BB22_1;
 
-BB24_4:
+BB22_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3408,130 +3115,130 @@ BB24_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB24_8;
+	@%p4 bra 	BB22_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB24_7;
+	@%p5 bra 	BB22_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	add.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB24_7:
+BB22_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB24_8:
+BB22_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB24_12;
+	@%p6 bra 	BB22_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB24_11;
+	@%p7 bra 	BB22_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	add.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB24_11:
+BB22_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB24_12:
+BB22_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB24_16;
+	@%p8 bra 	BB22_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB24_15;
+	@%p9 bra 	BB22_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	add.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB24_15:
+BB22_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB24_16:
+BB22_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB24_20;
+	@%p10 bra 	BB22_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB24_19;
+	@%p11 bra 	BB22_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	add.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB24_19:
+BB22_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB24_20:
+BB22_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB24_33;
+	@%p12 bra 	BB22_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB24_23;
+	@%p13 bra 	BB22_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	add.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB24_23:
+BB22_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB24_25;
+	@%p14 bra 	BB22_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	add.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB24_25:
+BB22_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB24_27;
+	@%p15 bra 	BB22_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	add.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB24_27:
+BB22_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB24_29;
+	@%p16 bra 	BB22_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	add.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB24_29:
+BB22_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB24_31;
+	@%p17 bra 	BB22_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	add.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB24_31:
+BB22_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB24_33;
+	@%p18 bra 	BB22_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	add.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB24_33:
+BB22_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB24_35;
+	@%p19 bra 	BB22_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3539,7 +3246,7 @@ BB24_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB24_35:
+BB22_35:
 	ret;
 }
 
@@ -3563,17 +3270,17 @@ BB24_35:
 	ld.param.u32 	%r4, [reduce_row_sum_param_3];
 	mov.u32 	%r6, %ctaid.x;
 	setp.ge.u32	%p1, %r6, %r5;
-	@%p1 bra 	BB25_35;
+	@%p1 bra 	BB23_35;
 
 	mov.u32 	%r38, %tid.x;
 	mov.f64 	%fd72, 0d0000000000000000;
 	mov.f64 	%fd73, %fd72;
 	setp.ge.u32	%p2, %r38, %r4;
-	@%p2 bra 	BB25_4;
+	@%p2 bra 	BB23_4;
 
 	cvta.to.global.u64 	%rd3, %rd1;
 
-BB25_3:
+BB23_3:
 	mad.lo.s32 	%r8, %r6, %r4, %r38;
 	mul.wide.u32 	%rd4, %r8, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -3583,9 +3290,9 @@ BB25_3:
 	add.s32 	%r38, %r9, %r38;
 	setp.lt.u32	%p3, %r38, %r4;
 	mov.f64 	%fd72, %fd73;
-	@%p3 bra 	BB25_3;
+	@%p3 bra 	BB23_3;
 
-BB25_4:
+BB23_4:
 	mov.f64 	%fd70, %fd72;
 	mov.u32 	%r10, %tid.x;
 	mul.wide.u32 	%rd6, %r10, 8;
@@ -3595,130 +3302,130 @@ BB25_4:
 	bar.sync 	0;
 	mov.u32 	%r11, %ntid.x;
 	setp.lt.u32	%p4, %r11, 1024;
-	@%p4 bra 	BB25_8;
+	@%p4 bra 	BB23_8;
 
 	setp.gt.u32	%p5, %r10, 511;
 	mov.f64 	%fd71, %fd70;
-	@%p5 bra 	BB25_7;
+	@%p5 bra 	BB23_7;
 
 	ld.shared.f64 	%fd29, [%rd8+4096];
 	add.f64 	%fd71, %fd70, %fd29;
 	st.shared.f64 	[%rd8], %fd71;
 
-BB25_7:
+BB23_7:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB25_8:
+BB23_8:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p6, %r11, 512;
-	@%p6 bra 	BB25_12;
+	@%p6 bra 	BB23_12;
 
 	setp.gt.u32	%p7, %r10, 255;
 	mov.f64 	%fd69, %fd68;
-	@%p7 bra 	BB25_11;
+	@%p7 bra 	BB23_11;
 
 	ld.shared.f64 	%fd30, [%rd8+2048];
 	add.f64 	%fd69, %fd68, %fd30;
 	st.shared.f64 	[%rd8], %fd69;
 
-BB25_11:
+BB23_11:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB25_12:
+BB23_12:
 	mov.f64 	%fd66, %fd68;
 	setp.lt.u32	%p8, %r11, 256;
-	@%p8 bra 	BB25_16;
+	@%p8 bra 	BB23_16;
 
 	setp.gt.u32	%p9, %r10, 127;
 	mov.f64 	%fd67, %fd66;
-	@%p9 bra 	BB25_15;
+	@%p9 bra 	BB23_15;
 
 	ld.shared.f64 	%fd31, [%rd8+1024];
 	add.f64 	%fd67, %fd66, %fd31;
 	st.shared.f64 	[%rd8], %fd67;
 
-BB25_15:
+BB23_15:
 	mov.f64 	%fd66, %fd67;
 	bar.sync 	0;
 
-BB25_16:
+BB23_16:
 	mov.f64 	%fd64, %fd66;
 	setp.lt.u32	%p10, %r11, 128;
-	@%p10 bra 	BB25_20;
+	@%p10 bra 	BB23_20;
 
 	setp.gt.u32	%p11, %r10, 63;
 	mov.f64 	%fd65, %fd64;
-	@%p11 bra 	BB25_19;
+	@%p11 bra 	BB23_19;
 
 	ld.shared.f64 	%fd32, [%rd8+512];
 	add.f64 	%fd65, %fd64, %fd32;
 	st.shared.f64 	[%rd8], %fd65;
 
-BB25_19:
+BB23_19:
 	mov.f64 	%fd64, %fd65;
 	bar.sync 	0;
 
-BB25_20:
+BB23_20:
 	mov.f64 	%fd63, %fd64;
 	setp.gt.u32	%p12, %r10, 31;
-	@%p12 bra 	BB25_33;
+	@%p12 bra 	BB23_33;
 
 	setp.lt.u32	%p13, %r11, 64;
-	@%p13 bra 	BB25_23;
+	@%p13 bra 	BB23_23;
 
 	ld.volatile.shared.f64 	%fd33, [%rd8+256];
 	add.f64 	%fd63, %fd63, %fd33;
 	st.volatile.shared.f64 	[%rd8], %fd63;
 
-BB25_23:
+BB23_23:
 	mov.f64 	%fd62, %fd63;
 	setp.lt.u32	%p14, %r11, 32;
-	@%p14 bra 	BB25_25;
+	@%p14 bra 	BB23_25;
 
 	ld.volatile.shared.f64 	%fd34, [%rd8+128];
 	add.f64 	%fd62, %fd62, %fd34;
 	st.volatile.shared.f64 	[%rd8], %fd62;
 
-BB25_25:
+BB23_25:
 	mov.f64 	%fd61, %fd62;
 	setp.lt.u32	%p15, %r11, 16;
-	@%p15 bra 	BB25_27;
+	@%p15 bra 	BB23_27;
 
 	ld.volatile.shared.f64 	%fd35, [%rd8+64];
 	add.f64 	%fd61, %fd61, %fd35;
 	st.volatile.shared.f64 	[%rd8], %fd61;
 
-BB25_27:
+BB23_27:
 	mov.f64 	%fd60, %fd61;
 	setp.lt.u32	%p16, %r11, 8;
-	@%p16 bra 	BB25_29;
+	@%p16 bra 	BB23_29;
 
 	ld.volatile.shared.f64 	%fd36, [%rd8+32];
 	add.f64 	%fd60, %fd60, %fd36;
 	st.volatile.shared.f64 	[%rd8], %fd60;
 
-BB25_29:
+BB23_29:
 	mov.f64 	%fd59, %fd60;
 	setp.lt.u32	%p17, %r11, 4;
-	@%p17 bra 	BB25_31;
+	@%p17 bra 	BB23_31;
 
 	ld.volatile.shared.f64 	%fd37, [%rd8+16];
 	add.f64 	%fd59, %fd59, %fd37;
 	st.volatile.shared.f64 	[%rd8], %fd59;
 
-BB25_31:
+BB23_31:
 	setp.lt.u32	%p18, %r11, 2;
-	@%p18 bra 	BB25_33;
+	@%p18 bra 	BB23_33;
 
 	ld.volatile.shared.f64 	%fd38, [%rd8+8];
 	add.f64 	%fd39, %fd59, %fd38;
 	st.volatile.shared.f64 	[%rd8], %fd39;
 
-BB25_33:
+BB23_33:
 	setp.ne.s32	%p19, %r10, 0;
-	@%p19 bra 	BB25_35;
+	@%p19 bra 	BB23_35;
 
 	ld.shared.f64 	%fd40, [sdata];
 	cvta.to.global.u64 	%rd39, %rd2;
@@ -3726,7 +3433,7 @@ BB25_33:
 	add.s64 	%rd41, %rd39, %rd40;
 	st.global.f64 	[%rd41], %fd40;
 
-BB25_35:
+BB23_35:
 	ret;
 }
 
@@ -3753,18 +3460,18 @@ BB25_35:
 	mov.u32 	%r9, %tid.x;
 	mad.lo.s32 	%r1, %r7, %r8, %r9;
 	setp.ge.u32	%p1, %r1, %r6;
-	@%p1 bra 	BB26_5;
+	@%p1 bra 	BB24_5;
 
 	cvta.to.global.u64 	%rd1, %rd2;
 	mul.lo.s32 	%r2, %r6, %r5;
 	mov.f64 	%fd8, 0d0000000000000000;
 	mov.f64 	%fd9, %fd8;
 	setp.ge.u32	%p2, %r1, %r2;
-	@%p2 bra 	BB26_4;
+	@%p2 bra 	BB24_4;
 
 	mov.u32 	%r10, %r1;
 
-BB26_3:
+BB24_3:
 	mov.u32 	%r3, %r10;
 	mul.wide.u32 	%rd4, %r3, 8;
 	add.s64 	%rd5, %rd1, %rd4;
@@ -3774,15 +3481,15 @@ BB26_3:
 	setp.lt.u32	%p3, %r4, %r2;
 	mov.u32 	%r10, %r4;
 	mov.f64 	%fd8, %fd9;
-	@%p3 bra 	BB26_3;
+	@%p3 bra 	BB24_3;
 
-BB26_4:
+BB24_4:
 	cvta.to.global.u64 	%rd6, %rd3;
 	mul.wide.u32 	%rd7, %r1, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd8;
 
-BB26_5:
+BB24_5:
 	ret;
 }
 
@@ -3807,12 +3514,12 @@ BB26_5:
 	shl.b32 	%r8, %r7, 1;
 	mov.u32 	%r9, %ntid.x;
 	mad.lo.s32 	%r32, %r8, %r9, %r6;
-	mov.f64 	%fd76, 0d0010000000000000;
+	mov.f64 	%fd76, 0dFFEFFFFFFFFFFFFF;
 	mov.f64 	%fd77, %fd76;
 	setp.ge.u32	%p1, %r32, %r5;
-	@%p1 bra 	BB27_4;
+	@%p1 bra 	BB25_4;
 
-BB27_1:
+BB25_1:
 	mov.f64 	%fd1, %fd77;
 	cvta.to.global.u64 	%rd4, %rd2;
 	mul.wide.u32 	%rd5, %r32, 8;
@@ -3821,23 +3528,23 @@ BB27_1:
 	max.f64 	%fd78, %fd1, %fd30;
 	add.s32 	%r3, %r32, %r9;
 	setp.ge.u32	%p2, %r3, %r5;
-	@%p2 bra 	BB27_3;
+	@%p2 bra 	BB25_3;
 
 	mul.wide.u32 	%rd8, %r3, 8;
 	add.s64 	%rd9, %rd4, %rd8;
 	ld.global.f64 	%fd31, [%rd9];
 	max.f64 	%fd78, %fd78, %fd31;
 
-BB27_3:
+BB25_3:
 	mov.f64 	%fd77, %fd78;
 	shl.b32 	%r12, %r9, 1;
 	mov.u32 	%r13, %nctaid.x;
 	mad.lo.s32 	%r32, %r12, %r13, %r32;
 	setp.lt.u32	%p3, %r32, %r5;
 	mov.f64 	%fd76, %fd77;
-	@%p3 bra 	BB27_1;
+	@%p3 bra 	BB25_1;
 
-BB27_4:
+BB25_4:
 	mov.f64 	%fd74, %fd76;
 	mul.wide.u32 	%rd10, %r6, 8;
 	mov.u64 	%rd11, sdata;
@@ -3845,130 +3552,130 @@ BB27_4:
 	st.shared.f64 	[%rd1], %fd74;
 	bar.sync 	0;
 	setp.lt.u32	%p4, %r9, 1024;
-	@%p4 bra 	BB27_8;
+	@%p4 bra 	BB25_8;
 
 	setp.gt.u32	%p5, %r6, 511;
 	mov.f64 	%fd75, %fd74;
-	@%p5 bra 	BB27_7;
+	@%p5 bra 	BB25_7;
 
 	ld.shared.f64 	%fd32, [%rd1+4096];
 	max.f64 	%fd75, %fd74, %fd32;
 	st.shared.f64 	[%rd1], %fd75;
 
-BB27_7:
+BB25_7:
 	mov.f64 	%fd74, %fd75;
 	bar.sync 	0;
 
-BB27_8:
+BB25_8:
 	mov.f64 	%fd72, %fd74;
 	setp.lt.u32	%p6, %r9, 512;
-	@%p6 bra 	BB27_12;
+	@%p6 bra 	BB25_12;
 
 	setp.gt.u32	%p7, %r6, 255;
 	mov.f64 	%fd73, %fd72;
-	@%p7 bra 	BB27_11;
+	@%p7 bra 	BB25_11;
 
 	ld.shared.f64 	%fd33, [%rd1+2048];
 	max.f64 	%fd73, %fd72, %fd33;
 	st.shared.f64 	[%rd1], %fd73;
 
-BB27_11:
+BB25_11:
 	mov.f64 	%fd72, %fd73;
 	bar.sync 	0;
 
-BB27_12:
+BB25_12:
 	mov.f64 	%fd70, %fd72;
 	setp.lt.u32	%p8, %r9, 256;
-	@%p8 bra 	BB27_16;
+	@%p8 bra 	BB25_16;
 
 	setp.gt.u32	%p9, %r6, 127;
 	mov.f64 	%fd71, %fd70;
-	@%p9 bra 	BB27_15;
+	@%p9 bra 	BB25_15;
 
 	ld.shared.f64 	%fd34, [%rd1+1024];
 	max.f64 	%fd71, %fd70, %fd34;
 	st.shared.f64 	[%rd1], %fd71;
 
-BB27_15:
+BB25_15:
 	mov.f64 	%fd70, %fd71;
 	bar.sync 	0;
 
-BB27_16:
+BB25_16:
 	mov.f64 	%fd68, %fd70;
 	setp.lt.u32	%p10, %r9, 128;
-	@%p10 bra 	BB27_20;
+	@%p10 bra 	BB25_20;
 
 	setp.gt.u32	%p11, %r6, 63;
 	mov.f64 	%fd69, %fd68;
-	@%p11 bra 	BB27_19;
+	@%p11 bra 	BB25_19;
 
 	ld.shared.f64 	%fd35, [%rd1+512];
 	max.f64 	%fd69, %fd68, %fd35;
 	st.shared.f64 	[%rd1], %fd69;
 
-BB27_19:
+BB25_19:
 	mov.f64 	%fd68, %fd69;
 	bar.sync 	0;
 
-BB27_20:
+BB25_20:
 	mov.f64 	%fd67, %fd68;
 	setp.gt.u32	%p12, %r6, 31;
-	@%p12 bra 	BB27_33;
+	@%p12 bra 	BB25_33;
 
 	setp.lt.u32	%p13, %r9, 64;
-	@%p13 bra 	BB27_23;
+	@%p13 bra 	BB25_23;
 
 	ld.volatile.shared.f64 	%fd36, [%rd1+256];
 	max.f64 	%fd67, %fd67, %fd36;
 	st.volatile.shared.f64 	[%rd1], %fd67;
 
-BB27_23:
+BB25_23:
 	mov.f64 	%fd66, %fd67;
 	setp.lt.u32	%p14, %r9, 32;
-	@%p14 bra 	BB27_25;
+	@%p14 bra 	BB25_25;
 
 	ld.volatile.shared.f64 	%fd37, [%rd1+128];
 	max.f64 	%fd66, %fd66, %fd37;
 	st.volatile.shared.f64 	[%rd1], %fd66;
 
-BB27_25:
+BB25_25:
 	mov.f64 	%fd65, %fd66;
 	setp.lt.u32	%p15, %r9, 16;
-	@%p15 bra 	BB27_27;
+	@%p15 bra 	BB25_27;
 
 	ld.volatile.shared.f64 	%fd38, [%rd1+64];
 	max.f64 	%fd65, %fd65, %fd38;
 	st.volatile.shared.f64 	[%rd1], %fd65;
 
-BB27_27:
+BB25_27:
 	mov.f64 	%fd64, %fd65;
 	setp.lt.u32	%p16, %r9, 8;
-	@%p16 bra 	BB27_29;
+	@%p16 bra 	BB25_29;
 
 	ld.volatile.shared.f64 	%fd39, [%rd1+32];
 	max.f64 	%fd64, %fd64, %fd39;
 	st.volatile.shared.f64 	[%rd1], %fd64;
 
-BB27_29:
+BB25_29:
 	mov.f64 	%fd63, %fd64;
 	setp.lt.u32	%p17, %r9, 4;
-	@%p17 bra 	BB27_31;
+	@%p17 bra 	BB25_31;
 
 	ld.volatile.shared.f64 	%fd40, [%rd1+16];
 	max.f64 	%fd63, %fd63, %fd40;
 	st.volatile.shared.f64 	[%rd1], %fd63;
 
-BB27_31:
+BB25_31:
 	setp.lt.u32	%p18, %r9, 2;
-	@%p18 bra 	BB27_33;
+	@%p18 bra 	BB25_33;
 
 	ld.volatile.shared.f64 	%fd41, [%rd1+8];
 	max.f64 	%fd42, %fd63, %fd41;
 	st.volatile.shared.f64 	[%rd1], %fd42;
 
-BB27_33:
+BB25_33:
 	setp.ne.s32	%p19, %r6, 0;
-	@%p19 bra 	BB27_35;
+	@%p19 bra 	BB25_35;
 
 	ld.shared.f64 	%fd43, [sdata];
 	cvta.to.global.u64 	%rd12, %rd3;
@@ -3976,7 +3683,7 @@ BB27_33:
 	add.s64 	%rd14, %rd12, %rd13;
 	st.global.f64 	[%rd14], %fd43;
 
-BB27_35:
+BB25_35:
 	ret;
 }
 
@@ -4

<TRUNCATED>