You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2016/11/16 22:41:24 UTC
[3/5] incubator-systemml git commit: [SYSTEMML-446] Exploit cublas libraries for transpose and certain cases of binary operations + add support for invoking custom kernels

[SYSTEMML-446] Exploit cublas libraries for transpose and certain cases of binary operations + add support for invoking custom kernels

1. Tanuj surveyed the CuBLAS libraries and implemented LibMatrixCUDA
methods. These methods map SystemML operations to CuBLAS calls similar
to LibMatrixCUDA.matmult(). Here are the list of methods: transpose,
cellwiseMatMatAddSub, matScalarElementwiseMultDiv and vectorScalarMult.
In subsequent PR, we will implement remaining binary operations using
custom kernels (i.e. phase 2).

2. Also, added initial support for invoking custom kernels in
preparation for phase 2. This relies of custom kernels compiled into ptx
format and included as resource file. Since ptx format are
target-independent, the kernels can be supported for different GPU
versions. The utility
org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels
simplifies the launching of the kernels. For example: to launch a kernel
copyUpperToLowerTriangleDense<<1,1,32,32>>(jcudaDenseMatrixPtr, dim,
dim*dim), the user has to call:
kernels.launchKernel("copyUpperToLowerTriangleDense", new
ExecutionConfig(1,1,32,32), jcudaDenseMatrixPtr, dim, dim*dim).

3. Also fixed a GPU bufferpool bug (setting of dirty flag).

4. Implemented sparseToDense() and denseToSparse() methods in
JCudaObject similar to MatrixBlock.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bfc0e0dc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bfc0e0dc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bfc0e0dc

Branch: refs/heads/master
Commit: bfc0e0dc772d477bb29d0915c7e793f2e3361a22
Parents: c94e86c
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Mon Oct 31 19:46:36 2016 -0700
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Wed Nov 16 14:37:36 2016 -0800

----------------------------------------------------------------------
 pom.xml                                         |   13 +-
 src/main/cpp/kernels/SystemML.cu                |  157 ++
 src/main/cpp/kernels/SystemML.ptx               | 1664 ++++++++++++++++++
 .../java/org/apache/sysml/conf/DMLConfig.java   |   11 +
 .../java/org/apache/sysml/hops/AggBinaryOp.java |    4 +-
 .../java/org/apache/sysml/hops/BinaryOp.java    |   11 +-
 .../java/org/apache/sysml/hops/ReorgOp.java     |    6 +-
 .../context/ExecutionContext.java               |    7 +-
 .../controlprogram/parfor/ProgramConverter.java |    4 +-
 .../instructions/GPUInstructionParser.java      |   24 +
 .../gpu/ArithmeticBinaryGPUInstruction.java     |   68 +
 .../instructions/gpu/GPUInstruction.java        |    2 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   |    5 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |   65 +
 .../instructions/gpu/ReorgGPUInstruction.java   |   90 +
 .../ScalarMatrixArithmeticGPUInstruction.java   |   72 +
 .../gpu/context/ExecutionConfig.java            |  137 ++
 .../instructions/gpu/context/GPUContext.java    |    5 +-
 .../instructions/gpu/context/JCudaContext.java  |   15 +-
 .../instructions/gpu/context/JCudaKernels.java  |  233 +++
 .../instructions/gpu/context/JCudaObject.java   |  293 ++-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  482 ++++-
 .../InstallDependencyForIntegrationTests.java   |   69 +-
 .../test/integration/AutomatedTestBase.java     |    1 +
 .../org/apache/sysml/test/utils/TestUtils.java  |    4 +
 25 files changed, 3330 insertions(+), 112 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index ab059c1..3f3fbad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -70,6 +70,7 @@
 		<scala.binary.version>2.10</scala.binary.version>
 		<scala.test.version>2.2.6</scala.test.version>
 		<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss z</maven.build.timestamp.format>
+		<enableGPU>false</enableGPU>
 		<!-- OS-specific JVM arguments for running integration tests -->
 		<integrationTestExtraJVMArgs />
 	</properties>
@@ -109,6 +110,13 @@
 			</excludes>
 			<targetPath>scripts</targetPath>
 		</resource>
+		<resource>
+			<directory>src/main/cpp/kernels</directory>
+			<excludes>
+				<exclude>*.cu</exclude>
+			</excludes>
+			<targetPath>kernels</targetPath>
+		</resource>
 	</resources>
 
 		<plugins>
@@ -119,14 +127,15 @@
 				<version>1.1.1</version>
 				<executions>
 					<execution>
-						<phase>pre-integration-test</phase>
+						<phase>prepare-package</phase>
 						<goals>
 							<goal>java</goal>
 						</goals>
 						<configuration>
 							<mainClass>org.apache.sysml.utils.InstallDependencyForIntegrationTests</mainClass>
 							<arguments>
-								<argument>${skipTests}</argument>
+								<argument>true</argument> <!-- <argument>${skipTests}</argument> -->
+								<argument>${enableGPU}</argument>
 							</arguments>
 						</configuration>
 					</execution>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
new file mode 100644
index 0000000..e91e00a
--- /dev/null
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ 
+/**********************************
+When updating a kernel or adding a new one, 
+please compile the ptx file and commit it:
+nvcc -ptx SystemML.cu 
+***********************************/
+
+// dim => rlen (Assumption: rlen == clen)
+// N = length of dense array
+extern "C"
+__global__ void copyUpperToLowerTriangleDense(double* ret, int dim, int N) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	int id_dest = iy * dim + ix;
+	if(iy > ix && id_dest < N) {
+		// TODO: Potential to reduce the number of threads by half
+		int id_src = ix * dim + iy;
+		ret[id_dest] = ret[id_src];
+	}
+}
+
+extern "C"
+__device__ double getBoolean(int val) {
+	if(val == 0)
+		return 0.0;
+	else
+		return 1.0;
+}
+
+// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, 
+// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, 
+// 11=min, 12=max, 13=and, 14=or, 15=log}
+extern "C"
+__device__ double binaryOp(double x, double y, int op) {
+	// 0=plus, 1=minus, 2=multiply, 3=divide, 4=power
+	if(op == 0)
+		return x + y;
+	else if(op == 1)
+		return x - y;
+	else if(op == 2)
+		return x * y;
+	else if(op == 3)
+		return x / y;
+	else if(op == 4)
+		return pow(x, y);
+	// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,	
+	else if(op == 5) 
+		return getBoolean(x < y);
+	else if(op == 6)
+		return getBoolean(x <= y);
+	else if(op == 7)
+		return getBoolean(x > y);
+	else if(op == 8)
+		return getBoolean(x >= y);
+	else if(op == 9)
+		return getBoolean(x == y);
+	else if(op == 10)
+		return getBoolean(x != y);
+	// 11=min, 12=max, 13=and, 14=or, 15=log
+	else if(op == 11) {
+		return min(x, y);
+	}
+	else if(op == 12) {
+		return max(x, y);
+	}
+	return -999;
+}
+
+extern "C"
+__global__ void dense_matrix_set(double* A,  double scalar, int rlen, int clen) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	int index = ix * clen + iy;
+	if(index < rlen*clen) {
+		A[index] = scalar;
+	}	
+}
+
+extern "C"
+__global__ void dense_matrix_copy(double* A,  double* ret, int rlen, int clen) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	int index = ix * clen + iy;
+	if(ix < rlen && iy < clen) {
+		ret[index] = A[index];
+	}
+}
+
+// Compares the value and set
+extern "C"
+__global__ void compareAndSet(double* A,  double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	int index = ix * clen + iy;
+	if(ix < rlen && iy < clen) {
+		if(abs(A[index]-compareVal) < tol)
+			ret[index] = ifEqualsVal;
+		else if(A[index] < compareVal)
+			ret[index] = ifLessThanVal;
+		else		
+			ret[index] = ifGreaterThanVal;
+	}
+}
+
+extern "C"
+__global__ void binCellOp(double* A, double* B, double* C, 
+	int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	
+	if(ix < maxRlen && iy < maxClen) {
+		int outIndex = ix * maxClen + iy;
+		int aIndex = outIndex;
+		int bIndex = outIndex;
+		if(vectorAStatus == 1)
+			aIndex = ix; // clen == 1
+		else if(vectorAStatus == 2)
+			aIndex = iy; // rlen == 1
+		if(vectorBStatus == 1)
+			bIndex = ix; // clen == 1
+		else if(vectorBStatus == 2)
+			bIndex = iy; // rlen == 1
+		C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
+		// printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+	}
+}
+
+extern "C"
+__global__ void binCellScalarOp(double* A, double scalar, double* C, int rlenA, int clenA, int op, int isLeftScalar) {
+	int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	int index = ix * clenA + iy;
+	if(index < rlenA*clenA) {
+		if(isLeftScalar)
+			C[index] = binaryOp(scalar, A[index], op);
+		else
+			C[index] = binaryOp(A[index], scalar, op);
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
new file mode 100644
index 0000000..493f78e
--- /dev/null
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -0,0 +1,1664 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19805474
+// Cuda compilation tools, release 7.5, V7.5.16
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	getBoolean
+.func  (.param .b64 func_retval0) __internal_accurate_pow
+(
+	.param .b64 __internal_accurate_pow_param_0,
+	.param .b64 __internal_accurate_pow_param_1
+)
+;
+
+.visible .func  (.param .b64 func_retval0) getBoolean(
+	.param .b32 getBoolean_param_0
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<2>;
+	.reg .f64 	%fd<2>;
+
+
+	ld.param.u32 	%r1, [getBoolean_param_0];
+	setp.eq.s32	%p1, %r1, 0;
+	selp.f64	%fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
+	st.param.f64	[func_retval0+0], %fd1;
+	ret;
+}
+
+	// .globl	binaryOp
+.visible .func  (.param .b64 func_retval0) binaryOp(
+	.param .b64 binaryOp_param_0,
+	.param .b64 binaryOp_param_1,
+	.param .b32 binaryOp_param_2
+)
+{
+	.reg .pred 	%p<39>;
+	.reg .b32 	%r<26>;
+	.reg .f64 	%fd<39>;
+	.reg .b64 	%rd<3>;
+
+
+	ld.param.f64 	%fd27, [binaryOp_param_0];
+	ld.param.f64 	%fd28, [binaryOp_param_1];
+	ld.param.u32 	%r3, [binaryOp_param_2];
+	setp.eq.s32	%p2, %r3, 0;
+	@%p2 bra 	BB1_38;
+
+	setp.eq.s32	%p3, %r3, 1;
+	@%p3 bra 	BB1_37;
+	bra.uni 	BB1_2;
+
+BB1_37:
+	sub.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+
+BB1_38:
+	add.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+
+BB1_2:
+	setp.eq.s32	%p4, %r3, 2;
+	@%p4 bra 	BB1_36;
+	bra.uni 	BB1_3;
+
+BB1_36:
+	mul.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+
+BB1_3:
+	setp.eq.s32	%p5, %r3, 3;
+	@%p5 bra 	BB1_35;
+	bra.uni 	BB1_4;
+
+BB1_35:
+	div.rn.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+
+BB1_4:
+	setp.eq.s32	%p6, %r3, 4;
+	@%p6 bra 	BB1_21;
+	bra.uni 	BB1_5;
+
+BB1_21:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r1}, %fd27;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd28;
+	}
+	bfe.u32 	%r4, %r2, 20, 11;
+	add.s32 	%r5, %r4, -1012;
+	mov.b64 	 %rd2, %fd28;
+	shl.b64 	%rd1, %rd2, %r5;
+	setp.eq.s64	%p21, %rd1, -9223372036854775808;
+	abs.f64 	%fd9, %fd27;
+	// Callseq Start 0
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd9;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd28;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd37, [retval0+0];
+	
+	//{
+	}// Callseq End 0
+	setp.lt.s32	%p22, %r1, 0;
+	and.pred  	%p1, %p22, %p21;
+	@!%p1 bra 	BB1_23;
+	bra.uni 	BB1_22;
+
+BB1_22:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r6}, %fd37;
+	}
+	xor.b32  	%r7, %r6, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r8, %temp}, %fd37;
+	}
+	mov.b64 	%fd37, {%r8, %r7};
+
+BB1_23:
+	mov.f64 	%fd36, %fd37;
+	setp.eq.f64	%p23, %fd27, 0d0000000000000000;
+	@%p23 bra 	BB1_26;
+	bra.uni 	BB1_24;
+
+BB1_26:
+	selp.b32	%r9, %r1, 0, %p21;
+	or.b32  	%r10, %r9, 2146435072;
+	setp.lt.s32	%p27, %r2, 0;
+	selp.b32	%r11, %r10, %r9, %p27;
+	mov.u32 	%r12, 0;
+	mov.b64 	%fd36, {%r12, %r11};
+	bra.uni 	BB1_27;
+
+BB1_5:
+	setp.eq.s32	%p7, %r3, 5;
+	@%p7 bra 	BB1_20;
+	bra.uni 	BB1_6;
+
+BB1_20:
+	setp.lt.f64	%p20, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p20;
+	bra.uni 	BB1_39;
+
+BB1_6:
+	setp.eq.s32	%p8, %r3, 6;
+	@%p8 bra 	BB1_19;
+	bra.uni 	BB1_7;
+
+BB1_19:
+	setp.le.f64	%p19, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p19;
+	bra.uni 	BB1_39;
+
+BB1_24:
+	setp.gt.s32	%p24, %r1, -1;
+	@%p24 bra 	BB1_27;
+
+	cvt.rzi.f64.f64	%fd30, %fd28;
+	setp.neu.f64	%p25, %fd30, %fd28;
+	selp.f64	%fd36, 0dFFF8000000000000, %fd36, %p25;
+
+BB1_27:
+	mov.f64 	%fd15, %fd36;
+	add.f64 	%fd16, %fd27, %fd28;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r13}, %fd16;
+	}
+	and.b32  	%r14, %r13, 2146435072;
+	setp.ne.s32	%p28, %r14, 2146435072;
+	mov.f64 	%fd35, %fd15;
+	@%p28 bra 	BB1_34;
+
+	setp.gtu.f64	%p29, %fd9, 0d7FF0000000000000;
+	mov.f64 	%fd35, %fd16;
+	@%p29 bra 	BB1_34;
+
+	abs.f64 	%fd17, %fd28;
+	setp.gtu.f64	%p30, %fd17, 0d7FF0000000000000;
+	mov.f64 	%fd34, %fd16;
+	mov.f64 	%fd35, %fd34;
+	@%p30 bra 	BB1_34;
+
+	setp.eq.f64	%p31, %fd17, 0d7FF0000000000000;
+	@%p31 bra 	BB1_33;
+	bra.uni 	BB1_31;
+
+BB1_33:
+	setp.gt.f64	%p33, %fd9, 0d3FF0000000000000;
+	selp.b32	%r21, 2146435072, 0, %p33;
+	xor.b32  	%r22, %r21, 2146435072;
+	setp.lt.s32	%p34, %r2, 0;
+	selp.b32	%r23, %r22, %r21, %p34;
+	setp.eq.f64	%p35, %fd27, 0dBFF0000000000000;
+	selp.b32	%r24, 1072693248, %r23, %p35;
+	mov.u32 	%r25, 0;
+	mov.b64 	%fd35, {%r25, %r24};
+	bra.uni 	BB1_34;
+
+BB1_7:
+	setp.eq.s32	%p9, %r3, 7;
+	@%p9 bra 	BB1_18;
+	bra.uni 	BB1_8;
+
+BB1_18:
+	setp.gt.f64	%p18, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p18;
+	bra.uni 	BB1_39;
+
+BB1_8:
+	setp.eq.s32	%p10, %r3, 8;
+	@%p10 bra 	BB1_17;
+	bra.uni 	BB1_9;
+
+BB1_17:
+	setp.ge.f64	%p17, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p17;
+	bra.uni 	BB1_39;
+
+BB1_9:
+	setp.eq.s32	%p11, %r3, 9;
+	@%p11 bra 	BB1_16;
+	bra.uni 	BB1_10;
+
+BB1_16:
+	setp.eq.f64	%p16, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p16;
+	bra.uni 	BB1_39;
+
+BB1_31:
+	setp.neu.f64	%p32, %fd9, 0d7FF0000000000000;
+	mov.f64 	%fd35, %fd15;
+	@%p32 bra 	BB1_34;
+
+	shr.s32 	%r15, %r2, 31;
+	and.b32  	%r16, %r15, -2146435072;
+	add.s32 	%r17, %r16, 2146435072;
+	or.b32  	%r18, %r17, -2147483648;
+	selp.b32	%r19, %r18, %r17, %p1;
+	mov.u32 	%r20, 0;
+	mov.b64 	%fd35, {%r20, %r19};
+
+BB1_34:
+	setp.eq.f64	%p36, %fd28, 0d0000000000000000;
+	setp.eq.f64	%p37, %fd27, 0d3FF0000000000000;
+	or.pred  	%p38, %p37, %p36;
+	selp.f64	%fd38, 0d3FF0000000000000, %fd35, %p38;
+
+BB1_39:
+	st.param.f64	[func_retval0+0], %fd38;
+	ret;
+
+BB1_10:
+	setp.eq.s32	%p12, %r3, 10;
+	@%p12 bra 	BB1_15;
+	bra.uni 	BB1_11;
+
+BB1_15:
+	setp.neu.f64	%p15, %fd27, %fd28;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p15;
+	bra.uni 	BB1_39;
+
+BB1_11:
+	setp.eq.s32	%p13, %r3, 11;
+	@%p13 bra 	BB1_14;
+	bra.uni 	BB1_12;
+
+BB1_14:
+	min.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+
+BB1_12:
+	mov.f64 	%fd38, 0dC08F380000000000;
+	setp.ne.s32	%p14, %r3, 12;
+	@%p14 bra 	BB1_39;
+
+	max.f64 	%fd38, %fd27, %fd28;
+	bra.uni 	BB1_39;
+}
+
+	// .globl	copyUpperToLowerTriangleDense
+.visible .entry copyUpperToLowerTriangleDense(
+	.param .u64 copyUpperToLowerTriangleDense_param_0,
+	.param .u32 copyUpperToLowerTriangleDense_param_1,
+	.param .u32 copyUpperToLowerTriangleDense_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<13>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<7>;
+
+
+	ld.param.u64 	%rd1, [copyUpperToLowerTriangleDense_param_0];
+	ld.param.u32 	%r4, [copyUpperToLowerTriangleDense_param_1];
+	ld.param.u32 	%r5, [copyUpperToLowerTriangleDense_param_2];
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r7, %r8;
+	mov.u32 	%r9, %ntid.y;
+	mov.u32 	%r10, %ctaid.y;
+	mov.u32 	%r11, %tid.y;
+	mad.lo.s32 	%r2, %r9, %r10, %r11;
+	mad.lo.s32 	%r3, %r2, %r4, %r1;
+	setp.gt.s32	%p1, %r2, %r1;
+	setp.lt.s32	%p2, %r3, %r5;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB2_2;
+	bra.uni 	BB2_1;
+
+BB2_1:
+	cvta.to.global.u64 	%rd2, %rd1;
+	mad.lo.s32 	%r12, %r1, %r4, %r2;
+	mul.wide.s32 	%rd3, %r12, 8;
+	add.s64 	%rd4, %rd2, %rd3;
+	ld.global.f64 	%fd1, [%rd4];
+	mul.wide.s32 	%rd5, %r3, 8;
+	add.s64 	%rd6, %rd2, %rd5;
+	st.global.f64 	[%rd6], %fd1;
+
+BB2_2:
+	ret;
+}
+
+	// .globl	dense_matrix_set
+.visible .entry dense_matrix_set(
+	.param .u64 dense_matrix_set_param_0,
+	.param .f64 dense_matrix_set_param_1,
+	.param .u32 dense_matrix_set_param_2,
+	.param .u32 dense_matrix_set_param_3
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<13>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<5>;
+
+
+	ld.param.u64 	%rd1, [dense_matrix_set_param_0];
+	ld.param.f64 	%fd1, [dense_matrix_set_param_1];
+	ld.param.u32 	%r2, [dense_matrix_set_param_2];
+	ld.param.u32 	%r3, [dense_matrix_set_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r7, %r5, %r4, %r6;
+	mov.u32 	%r8, %ntid.y;
+	mov.u32 	%r9, %ctaid.y;
+	mov.u32 	%r10, %tid.y;
+	mad.lo.s32 	%r11, %r7, %r3, %r10;
+	mad.lo.s32 	%r1, %r8, %r9, %r11;
+	mul.lo.s32 	%r12, %r3, %r2;
+	setp.ge.s32	%p1, %r1, %r12;
+	@%p1 bra 	BB3_2;
+
+	cvta.to.global.u64 	%rd2, %rd1;
+	mul.wide.s32 	%rd3, %r1, 8;
+	add.s64 	%rd4, %rd2, %rd3;
+	st.global.f64 	[%rd4], %fd1;
+
+BB3_2:
+	ret;
+}
+
+	// .globl	dense_matrix_copy
+.visible .entry dense_matrix_copy(
+	.param .u64 dense_matrix_copy_param_0,
+	.param .u64 dense_matrix_copy_param_1,
+	.param .u32 dense_matrix_copy_param_2,
+	.param .u32 dense_matrix_copy_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<12>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [dense_matrix_copy_param_0];
+	ld.param.u64 	%rd2, [dense_matrix_copy_param_1];
+	ld.param.u32 	%r2, [dense_matrix_copy_param_2];
+	ld.param.u32 	%r3, [dense_matrix_copy_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r7, %r5, %r4, %r6;
+	mov.u32 	%r8, %ntid.y;
+	mov.u32 	%r9, %ctaid.y;
+	mov.u32 	%r10, %tid.y;
+	mad.lo.s32 	%r11, %r8, %r9, %r10;
+	mad.lo.s32 	%r1, %r7, %r3, %r11;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.lt.s32	%p2, %r11, %r3;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB4_2;
+	bra.uni 	BB4_1;
+
+BB4_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f64 	[%rd7], %fd1;
+
+BB4_2:
+	ret;
+}
+
+	// .globl	compareAndSet
+.visible .entry compareAndSet(
+	.param .u64 compareAndSet_param_0,
+	.param .u64 compareAndSet_param_1,
+	.param .u32 compareAndSet_param_2,
+	.param .u32 compareAndSet_param_3,
+	.param .f64 compareAndSet_param_4,
+	.param .f64 compareAndSet_param_5,
+	.param .f64 compareAndSet_param_6,
+	.param .f64 compareAndSet_param_7,
+	.param .f64 compareAndSet_param_8
+)
+{
+	.reg .pred 	%p<6>;
+	.reg .b32 	%r<12>;
+	.reg .f64 	%fd<9>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd2, [compareAndSet_param_0];
+	ld.param.u64 	%rd3, [compareAndSet_param_1];
+	ld.param.u32 	%r2, [compareAndSet_param_2];
+	ld.param.u32 	%r3, [compareAndSet_param_3];
+	ld.param.f64 	%fd2, [compareAndSet_param_4];
+	ld.param.f64 	%fd3, [compareAndSet_param_5];
+	ld.param.f64 	%fd4, [compareAndSet_param_6];
+	ld.param.f64 	%fd5, [compareAndSet_param_7];
+	ld.param.f64 	%fd6, [compareAndSet_param_8];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r7, %r5, %r4, %r6;
+	mov.u32 	%r8, %ntid.y;
+	mov.u32 	%r9, %ctaid.y;
+	mov.u32 	%r10, %tid.y;
+	mad.lo.s32 	%r11, %r8, %r9, %r10;
+	mad.lo.s32 	%r1, %r7, %r3, %r11;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.lt.s32	%p2, %r11, %r3;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB5_6;
+	bra.uni 	BB5_1;
+
+BB5_1:
+	cvta.to.global.u64 	%rd4, %rd2;
+	mul.wide.s32 	%rd5, %r1, 8;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f64 	%fd1, [%rd6];
+	sub.f64 	%fd7, %fd1, %fd2;
+	abs.f64 	%fd8, %fd7;
+	setp.lt.f64	%p4, %fd8, %fd3;
+	cvta.to.global.u64 	%rd7, %rd3;
+	add.s64 	%rd1, %rd7, %rd5;
+	@%p4 bra 	BB5_5;
+	bra.uni 	BB5_2;
+
+BB5_5:
+	st.global.f64 	[%rd1], %fd4;
+	bra.uni 	BB5_6;
+
+BB5_2:
+	setp.lt.f64	%p5, %fd1, %fd2;
+	@%p5 bra 	BB5_4;
+	bra.uni 	BB5_3;
+
+BB5_4:
+	st.global.f64 	[%rd1], %fd5;
+	bra.uni 	BB5_6;
+
+BB5_3:
+	st.global.f64 	[%rd1], %fd6;
+
+BB5_6:
+	ret;
+}
+
+	// .globl	binCellOp
+.visible .entry binCellOp(
+	.param .u64 binCellOp_param_0,
+	.param .u64 binCellOp_param_1,
+	.param .u64 binCellOp_param_2,
+	.param .u32 binCellOp_param_3,
+	.param .u32 binCellOp_param_4,
+	.param .u32 binCellOp_param_5,
+	.param .u32 binCellOp_param_6,
+	.param .u32 binCellOp_param_7
+)
+{
+	.reg .pred 	%p<50>;
+	.reg .b32 	%r<52>;
+	.reg .f64 	%fd<39>;
+	.reg .b64 	%rd<15>;
+
+
+	ld.param.u64 	%rd2, [binCellOp_param_0];
+	ld.param.u64 	%rd3, [binCellOp_param_1];
+	ld.param.u64 	%rd4, [binCellOp_param_2];
+	ld.param.u32 	%r14, [binCellOp_param_3];
+	ld.param.u32 	%r10, [binCellOp_param_4];
+	ld.param.u32 	%r11, [binCellOp_param_5];
+	ld.param.u32 	%r12, [binCellOp_param_6];
+	ld.param.u32 	%r13, [binCellOp_param_7];
+	mov.u32 	%r15, %ntid.x;
+	mov.u32 	%r16, %ctaid.x;
+	mov.u32 	%r17, %tid.x;
+	mad.lo.s32 	%r1, %r15, %r16, %r17;
+	mov.u32 	%r18, %ntid.y;
+	mov.u32 	%r19, %ctaid.y;
+	mov.u32 	%r20, %tid.y;
+	mad.lo.s32 	%r2, %r18, %r19, %r20;
+	setp.lt.s32	%p2, %r1, %r14;
+	setp.lt.s32	%p3, %r2, %r10;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB6_53;
+	bra.uni 	BB6_1;
+
+BB6_1:
+	mad.lo.s32 	%r3, %r1, %r10, %r2;
+	setp.eq.s32	%p5, %r11, 1;
+	mov.u32 	%r50, %r1;
+	@%p5 bra 	BB6_5;
+
+	setp.ne.s32	%p6, %r11, 2;
+	mov.u32 	%r51, %r3;
+	@%p6 bra 	BB6_4;
+
+	mov.u32 	%r51, %r2;
+
+BB6_4:
+	mov.u32 	%r45, %r51;
+	mov.u32 	%r4, %r45;
+	mov.u32 	%r50, %r4;
+
+BB6_5:
+	mov.u32 	%r5, %r50;
+	setp.eq.s32	%p7, %r12, 1;
+	mov.u32 	%r48, %r1;
+	@%p7 bra 	BB6_9;
+
+	setp.ne.s32	%p8, %r12, 2;
+	mov.u32 	%r49, %r3;
+	@%p8 bra 	BB6_8;
+
+	mov.u32 	%r49, %r2;
+
+BB6_8:
+	mov.u32 	%r48, %r49;
+
+BB6_9:
+	cvta.to.global.u64 	%rd5, %rd3;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r5, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	ld.global.f64 	%fd1, [%rd8];
+	mul.wide.s32 	%rd9, %r48, 8;
+	add.s64 	%rd10, %rd5, %rd9;
+	ld.global.f64 	%fd2, [%rd10];
+	mov.f64 	%fd38, 0dC08F380000000000;
+	setp.gt.s32	%p9, %r13, 5;
+	@%p9 bra 	BB6_19;
+
+	setp.gt.s32	%p19, %r13, 2;
+	@%p19 bra 	BB6_15;
+
+	setp.eq.s32	%p23, %r13, 0;
+	@%p23 bra 	BB6_51;
+
+	setp.eq.s32	%p24, %r13, 1;
+	@%p24 bra 	BB6_50;
+	bra.uni 	BB6_13;
+
+BB6_50:
+	sub.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_19:
+	setp.gt.s32	%p10, %r13, 8;
+	@%p10 bra 	BB6_24;
+
+	setp.eq.s32	%p16, %r13, 6;
+	@%p16 bra 	BB6_34;
+
+	setp.eq.s32	%p17, %r13, 7;
+	@%p17 bra 	BB6_33;
+	bra.uni 	BB6_22;
+
+BB6_33:
+	setp.gt.f64	%p29, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
+	bra.uni 	BB6_52;
+
+BB6_15:
+	setp.eq.s32	%p20, %r13, 3;
+	@%p20 bra 	BB6_49;
+
+	setp.eq.s32	%p21, %r13, 4;
+	@%p21 bra 	BB6_35;
+	bra.uni 	BB6_17;
+
+BB6_35:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r8}, %fd1;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r9}, %fd2;
+	}
+	bfe.u32 	%r21, %r9, 20, 11;
+	add.s32 	%r22, %r21, -1012;
+	mov.b64 	 %rd11, %fd2;
+	shl.b64 	%rd1, %rd11, %r22;
+	setp.eq.s64	%p32, %rd1, -9223372036854775808;
+	abs.f64 	%fd11, %fd1;
+	// Callseq Start 1
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd11;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd2;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd37, [retval0+0];
+	
+	//{
+	}// Callseq End 1
+	setp.lt.s32	%p33, %r8, 0;
+	and.pred  	%p1, %p33, %p32;
+	@!%p1 bra 	BB6_37;
+	bra.uni 	BB6_36;
+
+BB6_36:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r23}, %fd37;
+	}
+	xor.b32  	%r24, %r23, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r25, %temp}, %fd37;
+	}
+	mov.b64 	%fd37, {%r25, %r24};
+
+BB6_37:
+	mov.f64 	%fd36, %fd37;
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	@%p34 bra 	BB6_40;
+	bra.uni 	BB6_38;
+
+BB6_40:
+	selp.b32	%r26, %r8, 0, %p32;
+	or.b32  	%r27, %r26, 2146435072;
+	setp.lt.s32	%p38, %r9, 0;
+	selp.b32	%r28, %r27, %r26, %p38;
+	mov.u32 	%r29, 0;
+	mov.b64 	%fd36, {%r29, %r28};
+	bra.uni 	BB6_41;
+
+BB6_24:
+	setp.gt.s32	%p11, %r13, 10;
+	@%p11 bra 	BB6_28;
+
+	setp.eq.s32	%p14, %r13, 9;
+	@%p14 bra 	BB6_32;
+	bra.uni 	BB6_26;
+
+BB6_32:
+	setp.eq.f64	%p27, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
+	bra.uni 	BB6_52;
+
+BB6_28:
+	setp.eq.s32	%p12, %r13, 11;
+	@%p12 bra 	BB6_31;
+	bra.uni 	BB6_29;
+
+BB6_31:
+	min.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_51:
+	add.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_13:
+	setp.eq.s32	%p25, %r13, 2;
+	@%p25 bra 	BB6_14;
+	bra.uni 	BB6_52;
+
+BB6_14:
+	mul.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_34:
+	setp.le.f64	%p30, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
+	bra.uni 	BB6_52;
+
+BB6_22:
+	setp.eq.s32	%p18, %r13, 8;
+	@%p18 bra 	BB6_23;
+	bra.uni 	BB6_52;
+
+BB6_23:
+	setp.ge.f64	%p28, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
+	bra.uni 	BB6_52;
+
+BB6_49:
+	div.rn.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_17:
+	setp.eq.s32	%p22, %r13, 5;
+	@%p22 bra 	BB6_18;
+	bra.uni 	BB6_52;
+
+BB6_18:
+	setp.lt.f64	%p31, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
+	bra.uni 	BB6_52;
+
+BB6_26:
+	setp.eq.s32	%p15, %r13, 10;
+	@%p15 bra 	BB6_27;
+	bra.uni 	BB6_52;
+
+BB6_27:
+	setp.neu.f64	%p26, %fd1, %fd2;
+	selp.f64	%fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB6_52;
+
+BB6_29:
+	setp.ne.s32	%p13, %r13, 12;
+	@%p13 bra 	BB6_52;
+
+	max.f64 	%fd38, %fd1, %fd2;
+	bra.uni 	BB6_52;
+
+BB6_38:
+	setp.gt.s32	%p35, %r8, -1;
+	@%p35 bra 	BB6_41;
+
+	cvt.rzi.f64.f64	%fd30, %fd2;
+	setp.neu.f64	%p36, %fd30, %fd2;
+	selp.f64	%fd36, 0dFFF8000000000000, %fd36, %p36;
+
+BB6_41:
+	mov.f64 	%fd17, %fd36;
+	add.f64 	%fd18, %fd1, %fd2;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r30}, %fd18;
+	}
+	and.b32  	%r31, %r30, 2146435072;
+	setp.ne.s32	%p39, %r31, 2146435072;
+	mov.f64 	%fd35, %fd17;
+	@%p39 bra 	BB6_48;
+
+	setp.gtu.f64	%p40, %fd11, 0d7FF0000000000000;
+	mov.f64 	%fd35, %fd18;
+	@%p40 bra 	BB6_48;
+
+	abs.f64 	%fd19, %fd2;
+	setp.gtu.f64	%p41, %fd19, 0d7FF0000000000000;
+	mov.f64 	%fd34, %fd18;
+	mov.f64 	%fd35, %fd34;
+	@%p41 bra 	BB6_48;
+
+	setp.eq.f64	%p42, %fd19, 0d7FF0000000000000;
+	@%p42 bra 	BB6_47;
+	bra.uni 	BB6_45;
+
+BB6_47:
+	setp.gt.f64	%p44, %fd11, 0d3FF0000000000000;
+	selp.b32	%r38, 2146435072, 0, %p44;
+	xor.b32  	%r39, %r38, 2146435072;
+	setp.lt.s32	%p45, %r9, 0;
+	selp.b32	%r40, %r39, %r38, %p45;
+	setp.eq.f64	%p46, %fd1, 0dBFF0000000000000;
+	selp.b32	%r41, 1072693248, %r40, %p46;
+	mov.u32 	%r42, 0;
+	mov.b64 	%fd35, {%r42, %r41};
+	bra.uni 	BB6_48;
+
+BB6_45:
+	setp.neu.f64	%p43, %fd11, 0d7FF0000000000000;
+	mov.f64 	%fd35, %fd17;
+	@%p43 bra 	BB6_48;
+
+	shr.s32 	%r32, %r9, 31;
+	and.b32  	%r33, %r32, -2146435072;
+	add.s32 	%r34, %r33, 2146435072;
+	or.b32  	%r35, %r34, -2147483648;
+	selp.b32	%r36, %r35, %r34, %p1;
+	mov.u32 	%r37, 0;
+	mov.b64 	%fd35, {%r37, %r36};
+
+BB6_48:
+	setp.eq.f64	%p47, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p48, %fd1, 0d3FF0000000000000;
+	or.pred  	%p49, %p48, %p47;
+	selp.f64	%fd38, 0d3FF0000000000000, %fd35, %p49;
+
+BB6_52:
+	cvta.to.global.u64 	%rd12, %rd4;
+	mul.wide.s32 	%rd13, %r3, 8;
+	add.s64 	%rd14, %rd12, %rd13;
+	st.global.f64 	[%rd14], %fd38;
+
+BB6_53:
+	ret;
+}
+
+	// .globl	binCellScalarOp
+.visible .entry binCellScalarOp(
+	.param .u64 binCellScalarOp_param_0,
+	.param .f64 binCellScalarOp_param_1,
+	.param .u64 binCellScalarOp_param_2,
+	.param .u32 binCellScalarOp_param_3,
+	.param .u32 binCellScalarOp_param_4,
+	.param .u32 binCellScalarOp_param_5,
+	.param .u32 binCellScalarOp_param_6
+)
+{
+	.reg .pred 	%p<85>;
+	.reg .b32 	%r<63>;
+	.reg .f64 	%fd<75>;
+	.reg .b64 	%rd<12>;
+
+
+	ld.param.u64 	%rd4, [binCellScalarOp_param_0];
+	ld.param.f64 	%fd54, [binCellScalarOp_param_1];
+	ld.param.u64 	%rd5, [binCellScalarOp_param_2];
+	ld.param.u32 	%r8, [binCellScalarOp_param_3];
+	ld.param.u32 	%r9, [binCellScalarOp_param_4];
+	ld.param.u32 	%r6, [binCellScalarOp_param_5];
+	ld.param.u32 	%r7, [binCellScalarOp_param_6];
+	mov.u32 	%r10, %ctaid.x;
+	mov.u32 	%r11, %ntid.x;
+	mov.u32 	%r12, %tid.x;
+	mad.lo.s32 	%r13, %r11, %r10, %r12;
+	mov.u32 	%r14, %ntid.y;
+	mov.u32 	%r15, %ctaid.y;
+	mov.u32 	%r16, %tid.y;
+	mad.lo.s32 	%r17, %r13, %r9, %r16;
+	mad.lo.s32 	%r1, %r14, %r15, %r17;
+	mul.lo.s32 	%r18, %r9, %r8;
+	setp.ge.s32	%p3, %r1, %r18;
+	@%p3 bra 	BB7_88;
+
+	cvta.to.global.u64 	%rd6, %rd5;
+	cvta.to.global.u64 	%rd7, %rd4;
+	mul.wide.s32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f64 	%fd1, [%rd9];
+	add.s64 	%rd1, %rd6, %rd8;
+	setp.eq.s32	%p4, %r7, 0;
+	@%p4 bra 	BB7_45;
+
+	setp.eq.s32	%p5, %r6, 0;
+	@%p5 bra 	BB7_43;
+
+	mov.f64 	%fd66, 0dC08F380000000000;
+	setp.gt.s32	%p6, %r6, 6;
+	@%p6 bra 	BB7_13;
+
+	setp.gt.s32	%p14, %r6, 3;
+	@%p14 bra 	BB7_9;
+
+	setp.eq.s32	%p18, %r6, 1;
+	@%p18 bra 	BB7_42;
+
+	setp.eq.s32	%p19, %r6, 2;
+	@%p19 bra 	BB7_41;
+	bra.uni 	BB7_7;
+
+BB7_41:
+	mul.f64 	%fd66, %fd1, %fd54;
+	bra.uni 	BB7_44;
+
+BB7_45:
+	setp.eq.s32	%p45, %r6, 0;
+	@%p45 bra 	BB7_86;
+
+	mov.f64 	%fd74, 0dC08F380000000000;
+	setp.gt.s32	%p46, %r6, 6;
+	@%p46 bra 	BB7_56;
+
+	setp.gt.s32	%p54, %r6, 3;
+	@%p54 bra 	BB7_52;
+
+	setp.eq.s32	%p58, %r6, 1;
+	@%p58 bra 	BB7_85;
+
+	setp.eq.s32	%p59, %r6, 2;
+	@%p59 bra 	BB7_84;
+	bra.uni 	BB7_50;
+
+BB7_84:
+	mul.f64 	%fd74, %fd1, %fd54;
+	bra.uni 	BB7_87;
+
+BB7_43:
+	add.f64 	%fd66, %fd1, %fd54;
+
+BB7_44:
+	st.global.f64 	[%rd1], %fd66;
+	bra.uni 	BB7_88;
+
+BB7_13:
+	setp.gt.s32	%p7, %r6, 9;
+	@%p7 bra 	BB7_18;
+
+	setp.eq.s32	%p11, %r6, 7;
+	@%p11 bra 	BB7_25;
+
+	setp.eq.s32	%p12, %r6, 8;
+	@%p12 bra 	BB7_24;
+	bra.uni 	BB7_16;
+
+BB7_24:
+	setp.le.f64	%p23, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
+	bra.uni 	BB7_44;
+
+BB7_86:
+	add.f64 	%fd74, %fd1, %fd54;
+
+BB7_87:
+	st.global.f64 	[%rd1], %fd74;
+
+BB7_88:
+	ret;
+
+BB7_56:
+	setp.gt.s32	%p47, %r6, 9;
+	@%p47 bra 	BB7_61;
+
+	setp.eq.s32	%p51, %r6, 7;
+	@%p51 bra 	BB7_68;
+
+	setp.eq.s32	%p52, %r6, 8;
+	@%p52 bra 	BB7_67;
+	bra.uni 	BB7_59;
+
+BB7_67:
+	setp.ge.f64	%p63, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p63;
+	bra.uni 	BB7_87;
+
+BB7_9:
+	setp.eq.s32	%p15, %r6, 4;
+	@%p15 bra 	BB7_27;
+
+	setp.eq.s32	%p16, %r6, 5;
+	@%p16 bra 	BB7_26;
+	bra.uni 	BB7_11;
+
+BB7_26:
+	setp.gt.f64	%p26, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
+	bra.uni 	BB7_44;
+
+BB7_18:
+	setp.eq.s32	%p8, %r6, 10;
+	@%p8 bra 	BB7_23;
+
+	setp.eq.s32	%p9, %r6, 11;
+	@%p9 bra 	BB7_22;
+	bra.uni 	BB7_20;
+
+BB7_22:
+	min.f64 	%fd66, %fd54, %fd1;
+	bra.uni 	BB7_44;
+
+BB7_52:
+	setp.eq.s32	%p55, %r6, 4;
+	@%p55 bra 	BB7_70;
+
+	setp.eq.s32	%p56, %r6, 5;
+	@%p56 bra 	BB7_69;
+	bra.uni 	BB7_54;
+
+BB7_69:
+	setp.lt.f64	%p66, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p66;
+	bra.uni 	BB7_87;
+
+BB7_61:
+	setp.eq.s32	%p48, %r6, 10;
+	@%p48 bra 	BB7_66;
+
+	setp.eq.s32	%p49, %r6, 11;
+	@%p49 bra 	BB7_65;
+	bra.uni 	BB7_63;
+
+BB7_65:
+	min.f64 	%fd74, %fd1, %fd54;
+	bra.uni 	BB7_87;
+
+BB7_42:
+	sub.f64 	%fd66, %fd54, %fd1;
+	bra.uni 	BB7_44;
+
+BB7_7:
+	setp.eq.s32	%p20, %r6, 3;
+	@%p20 bra 	BB7_8;
+	bra.uni 	BB7_44;
+
+BB7_8:
+	div.rn.f64 	%fd66, %fd54, %fd1;
+	bra.uni 	BB7_44;
+
+BB7_25:
+	setp.lt.f64	%p24, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
+	bra.uni 	BB7_44;
+
+BB7_16:
+	setp.eq.s32	%p13, %r6, 9;
+	@%p13 bra 	BB7_17;
+	bra.uni 	BB7_44;
+
+BB7_17:
+	setp.eq.f64	%p22, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
+	bra.uni 	BB7_44;
+
+BB7_27:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd54;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd1;
+	}
+	bfe.u32 	%r19, %r3, 20, 11;
+	add.s32 	%r20, %r19, -1012;
+	mov.b64 	 %rd10, %fd1;
+	shl.b64 	%rd2, %rd10, %r20;
+	setp.eq.s64	%p27, %rd2, -9223372036854775808;
+	abs.f64 	%fd10, %fd54;
+	// Callseq Start 2
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd10;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd1;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd65, [retval0+0];
+	
+	//{
+	}// Callseq End 2
+	setp.lt.s32	%p28, %r2, 0;
+	and.pred  	%p1, %p28, %p27;
+	@!%p1 bra 	BB7_29;
+	bra.uni 	BB7_28;
+
+BB7_28:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r21}, %fd65;
+	}
+	xor.b32  	%r22, %r21, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r23, %temp}, %fd65;
+	}
+	mov.b64 	%fd65, {%r23, %r22};
+
+BB7_29:
+	mov.f64 	%fd64, %fd65;
+	setp.eq.f64	%p29, %fd54, 0d0000000000000000;
+	@%p29 bra 	BB7_32;
+	bra.uni 	BB7_30;
+
+BB7_32:
+	selp.b32	%r24, %r2, 0, %p27;
+	or.b32  	%r25, %r24, 2146435072;
+	setp.lt.s32	%p33, %r3, 0;
+	selp.b32	%r26, %r25, %r24, %p33;
+	mov.u32 	%r27, 0;
+	mov.b64 	%fd64, {%r27, %r26};
+	bra.uni 	BB7_33;
+
+BB7_11:
+	setp.eq.s32	%p17, %r6, 6;
+	@%p17 bra 	BB7_12;
+	bra.uni 	BB7_44;
+
+BB7_12:
+	setp.ge.f64	%p25, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
+	bra.uni 	BB7_44;
+
+BB7_23:
+	setp.neu.f64	%p21, %fd1, %fd54;
+	selp.f64	%fd66, 0d3FF0000000000000, 0d0000000000000000, %p21;
+	bra.uni 	BB7_44;
+
+BB7_20:
+	setp.ne.s32	%p10, %r6, 12;
+	@%p10 bra 	BB7_44;
+
+	max.f64 	%fd66, %fd54, %fd1;
+	bra.uni 	BB7_44;
+
+BB7_85:
+	sub.f64 	%fd74, %fd1, %fd54;
+	bra.uni 	BB7_87;
+
+BB7_50:
+	setp.eq.s32	%p60, %r6, 3;
+	@%p60 bra 	BB7_51;
+	bra.uni 	BB7_87;
+
+BB7_51:
+	div.rn.f64 	%fd74, %fd1, %fd54;
+	bra.uni 	BB7_87;
+
+BB7_68:
+	setp.gt.f64	%p64, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p64;
+	bra.uni 	BB7_87;
+
+BB7_59:
+	setp.eq.s32	%p53, %r6, 9;
+	@%p53 bra 	BB7_60;
+	bra.uni 	BB7_87;
+
+BB7_60:
+	setp.eq.f64	%p62, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p62;
+	bra.uni 	BB7_87;
+
+BB7_70:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r4}, %fd1;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r5}, %fd54;
+	}
+	bfe.u32 	%r41, %r5, 20, 11;
+	add.s32 	%r42, %r41, -1012;
+	mov.b64 	 %rd11, %fd54;
+	shl.b64 	%rd3, %rd11, %r42;
+	setp.eq.s64	%p67, %rd3, -9223372036854775808;
+	abs.f64 	%fd36, %fd1;
+	// Callseq Start 3
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.f64	[param0+0], %fd36;
+	.param .b64 param1;
+	st.param.f64	[param1+0], %fd54;
+	.param .b64 retval0;
+	call.uni (retval0), 
+	__internal_accurate_pow, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.f64	%fd73, [retval0+0];
+	
+	//{
+	}// Callseq End 3
+	setp.lt.s32	%p68, %r4, 0;
+	and.pred  	%p2, %p68, %p67;
+	@!%p2 bra 	BB7_72;
+	bra.uni 	BB7_71;
+
+BB7_71:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r43}, %fd73;
+	}
+	xor.b32  	%r44, %r43, -2147483648;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r45, %temp}, %fd73;
+	}
+	mov.b64 	%fd73, {%r45, %r44};
+
+BB7_72:
+	mov.f64 	%fd72, %fd73;
+	setp.eq.f64	%p69, %fd1, 0d0000000000000000;
+	@%p69 bra 	BB7_75;
+	bra.uni 	BB7_73;
+
+BB7_75:
+	selp.b32	%r46, %r4, 0, %p67;
+	or.b32  	%r47, %r46, 2146435072;
+	setp.lt.s32	%p73, %r5, 0;
+	selp.b32	%r48, %r47, %r46, %p73;
+	mov.u32 	%r49, 0;
+	mov.b64 	%fd72, {%r49, %r48};
+	bra.uni 	BB7_76;
+
+BB7_54:
+	setp.eq.s32	%p57, %r6, 6;
+	@%p57 bra 	BB7_55;
+	bra.uni 	BB7_87;
+
+BB7_55:
+	setp.le.f64	%p65, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p65;
+	bra.uni 	BB7_87;
+
+BB7_66:
+	setp.neu.f64	%p61, %fd1, %fd54;
+	selp.f64	%fd74, 0d3FF0000000000000, 0d0000000000000000, %p61;
+	bra.uni 	BB7_87;
+
+BB7_63:
+	setp.ne.s32	%p50, %r6, 12;
+	@%p50 bra 	BB7_87;
+
+	max.f64 	%fd74, %fd1, %fd54;
+	bra.uni 	BB7_87;
+
+BB7_30:
+	setp.gt.s32	%p30, %r2, -1;
+	@%p30 bra 	BB7_33;
+
+	cvt.rzi.f64.f64	%fd56, %fd1;
+	setp.neu.f64	%p31, %fd56, %fd1;
+	selp.f64	%fd64, 0dFFF8000000000000, %fd64, %p31;
+
+BB7_33:
+	mov.f64 	%fd16, %fd64;
+	add.f64 	%fd17, %fd1, %fd54;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r28}, %fd17;
+	}
+	and.b32  	%r29, %r28, 2146435072;
+	setp.ne.s32	%p34, %r29, 2146435072;
+	mov.f64 	%fd63, %fd16;
+	@%p34 bra 	BB7_40;
+
+	setp.gtu.f64	%p35, %fd10, 0d7FF0000000000000;
+	mov.f64 	%fd63, %fd17;
+	@%p35 bra 	BB7_40;
+
+	abs.f64 	%fd18, %fd1;
+	setp.gtu.f64	%p36, %fd18, 0d7FF0000000000000;
+	mov.f64 	%fd62, %fd17;
+	mov.f64 	%fd63, %fd62;
+	@%p36 bra 	BB7_40;
+
+	setp.eq.f64	%p37, %fd18, 0d7FF0000000000000;
+	@%p37 bra 	BB7_39;
+	bra.uni 	BB7_37;
+
+BB7_39:
+	setp.gt.f64	%p39, %fd10, 0d3FF0000000000000;
+	selp.b32	%r36, 2146435072, 0, %p39;
+	xor.b32  	%r37, %r36, 2146435072;
+	setp.lt.s32	%p40, %r3, 0;
+	selp.b32	%r38, %r37, %r36, %p40;
+	setp.eq.f64	%p41, %fd54, 0dBFF0000000000000;
+	selp.b32	%r39, 1072693248, %r38, %p41;
+	mov.u32 	%r40, 0;
+	mov.b64 	%fd63, {%r40, %r39};
+	bra.uni 	BB7_40;
+
+BB7_73:
+	setp.gt.s32	%p70, %r4, -1;
+	@%p70 bra 	BB7_76;
+
+	cvt.rzi.f64.f64	%fd58, %fd54;
+	setp.neu.f64	%p71, %fd58, %fd54;
+	selp.f64	%fd72, 0dFFF8000000000000, %fd72, %p71;
+
+BB7_76:
+	mov.f64 	%fd42, %fd72;
+	add.f64 	%fd43, %fd1, %fd54;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r50}, %fd43;
+	}
+	and.b32  	%r51, %r50, 2146435072;
+	setp.ne.s32	%p74, %r51, 2146435072;
+	mov.f64 	%fd71, %fd42;
+	@%p74 bra 	BB7_83;
+
+	setp.gtu.f64	%p75, %fd36, 0d7FF0000000000000;
+	mov.f64 	%fd71, %fd43;
+	@%p75 bra 	BB7_83;
+
+	abs.f64 	%fd44, %fd54;
+	setp.gtu.f64	%p76, %fd44, 0d7FF0000000000000;
+	mov.f64 	%fd70, %fd43;
+	mov.f64 	%fd71, %fd70;
+	@%p76 bra 	BB7_83;
+
+	setp.eq.f64	%p77, %fd44, 0d7FF0000000000000;
+	@%p77 bra 	BB7_82;
+	bra.uni 	BB7_80;
+
+BB7_82:
+	setp.gt.f64	%p79, %fd36, 0d3FF0000000000000;
+	selp.b32	%r58, 2146435072, 0, %p79;
+	xor.b32  	%r59, %r58, 2146435072;
+	setp.lt.s32	%p80, %r5, 0;
+	selp.b32	%r60, %r59, %r58, %p80;
+	setp.eq.f64	%p81, %fd1, 0dBFF0000000000000;
+	selp.b32	%r61, 1072693248, %r60, %p81;
+	mov.u32 	%r62, 0;
+	mov.b64 	%fd71, {%r62, %r61};
+	bra.uni 	BB7_83;
+
+BB7_37:
+	setp.neu.f64	%p38, %fd10, 0d7FF0000000000000;
+	mov.f64 	%fd63, %fd16;
+	@%p38 bra 	BB7_40;
+
+	shr.s32 	%r30, %r3, 31;
+	and.b32  	%r31, %r30, -2146435072;
+	add.s32 	%r32, %r31, 2146435072;
+	or.b32  	%r33, %r32, -2147483648;
+	selp.b32	%r34, %r33, %r32, %p1;
+	mov.u32 	%r35, 0;
+	mov.b64 	%fd63, {%r35, %r34};
+
+BB7_40:
+	setp.eq.f64	%p42, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p43, %fd54, 0d3FF0000000000000;
+	or.pred  	%p44, %p43, %p42;
+	selp.f64	%fd66, 0d3FF0000000000000, %fd63, %p44;
+	bra.uni 	BB7_44;
+
+BB7_80:
+	setp.neu.f64	%p78, %fd36, 0d7FF0000000000000;
+	mov.f64 	%fd71, %fd42;
+	@%p78 bra 	BB7_83;
+
+	shr.s32 	%r52, %r5, 31;
+	and.b32  	%r53, %r52, -2146435072;
+	add.s32 	%r54, %r53, 2146435072;
+	or.b32  	%r55, %r54, -2147483648;
+	selp.b32	%r56, %r55, %r54, %p2;
+	mov.u32 	%r57, 0;
+	mov.b64 	%fd71, {%r57, %r56};
+
+BB7_83:
+	setp.eq.f64	%p82, %fd54, 0d0000000000000000;
+	setp.eq.f64	%p83, %fd1, 0d3FF0000000000000;
+	or.pred  	%p84, %p83, %p82;
+	selp.f64	%fd74, 0d3FF0000000000000, %fd71, %p84;
+	bra.uni 	BB7_87;
+}
+
+.func  (.param .b64 func_retval0) __internal_accurate_pow(
+	.param .b64 __internal_accurate_pow_param_0,
+	.param .b64 __internal_accurate_pow_param_1
+)
+{
+	.reg .pred 	%p<8>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<49>;
+	.reg .f64 	%fd<136>;
+
+
+	ld.param.f64 	%fd12, [__internal_accurate_pow_param_0];
+	ld.param.f64 	%fd13, [__internal_accurate_pow_param_1];
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r46}, %fd12;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r45, %temp}, %fd12;
+	}
+	shr.u32 	%r47, %r46, 20;
+	setp.ne.s32	%p1, %r47, 0;
+	@%p1 bra 	BB8_2;
+
+	mul.f64 	%fd14, %fd12, 0d4350000000000000;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r46}, %fd14;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r45, %temp}, %fd14;
+	}
+	shr.u32 	%r16, %r46, 20;
+	add.s32 	%r47, %r16, -54;
+
+BB8_2:
+	add.s32 	%r48, %r47, -1023;
+	and.b32  	%r17, %r46, -2146435073;
+	or.b32  	%r18, %r17, 1072693248;
+	mov.b64 	%fd134, {%r45, %r18};
+	setp.lt.u32	%p2, %r18, 1073127583;
+	@%p2 bra 	BB8_4;
+
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r19, %temp}, %fd134;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r20}, %fd134;
+	}
+	add.s32 	%r21, %r20, -1048576;
+	mov.b64 	%fd134, {%r19, %r21};
+	add.s32 	%r48, %r47, -1022;
+
+BB8_4:
+	add.f64 	%fd16, %fd134, 0d3FF0000000000000;
+	// inline asm
+	rcp.approx.ftz.f64 %fd15,%fd16;
+	// inline asm
+	neg.f64 	%fd17, %fd16;
+	mov.f64 	%fd18, 0d3FF0000000000000;
+	fma.rn.f64 	%fd19, %fd17, %fd15, %fd18;
+	fma.rn.f64 	%fd20, %fd19, %fd19, %fd19;
+	fma.rn.f64 	%fd21, %fd20, %fd15, %fd15;
+	add.f64 	%fd22, %fd134, 0dBFF0000000000000;
+	mul.f64 	%fd23, %fd22, %fd21;
+	fma.rn.f64 	%fd24, %fd22, %fd21, %fd23;
+	mul.f64 	%fd25, %fd24, %fd24;
+	mov.f64 	%fd26, 0d3ED0F5D241AD3B5A;
+	mov.f64 	%fd27, 0d3EB0F5FF7D2CAFE2;
+	fma.rn.f64 	%fd28, %fd27, %fd25, %fd26;
+	mov.f64 	%fd29, 0d3EF3B20A75488A3F;
+	fma.rn.f64 	%fd30, %fd28, %fd25, %fd29;
+	mov.f64 	%fd31, 0d3F1745CDE4FAECD5;
+	fma.rn.f64 	%fd32, %fd30, %fd25, %fd31;
+	mov.f64 	%fd33, 0d3F3C71C7258A578B;
+	fma.rn.f64 	%fd34, %fd32, %fd25, %fd33;
+	mov.f64 	%fd35, 0d3F6249249242B910;
+	fma.rn.f64 	%fd36, %fd34, %fd25, %fd35;
+	mov.f64 	%fd37, 0d3F89999999999DFB;
+	fma.rn.f64 	%fd38, %fd36, %fd25, %fd37;
+	sub.f64 	%fd39, %fd22, %fd24;
+	add.f64 	%fd40, %fd39, %fd39;
+	neg.f64 	%fd41, %fd24;
+	fma.rn.f64 	%fd42, %fd41, %fd22, %fd40;
+	mul.f64 	%fd43, %fd21, %fd42;
+	fma.rn.f64 	%fd44, %fd25, %fd38, 0d3FB5555555555555;
+	mov.f64 	%fd45, 0d3FB5555555555555;
+	sub.f64 	%fd46, %fd45, %fd44;
+	fma.rn.f64 	%fd47, %fd25, %fd38, %fd46;
+	add.f64 	%fd48, %fd47, 0d0000000000000000;
+	add.f64 	%fd49, %fd48, 0dBC46A4CB00B9E7B0;
+	add.f64 	%fd50, %fd44, %fd49;
+	sub.f64 	%fd51, %fd44, %fd50;
+	add.f64 	%fd52, %fd49, %fd51;
+	mul.rn.f64 	%fd53, %fd24, %fd24;
+	neg.f64 	%fd54, %fd53;
+	fma.rn.f64 	%fd55, %fd24, %fd24, %fd54;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r22, %temp}, %fd43;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r23}, %fd43;
+	}
+	add.s32 	%r24, %r23, 1048576;
+	mov.b64 	%fd56, {%r22, %r24};
+	fma.rn.f64 	%fd57, %fd24, %fd56, %fd55;
+	mul.rn.f64 	%fd58, %fd53, %fd24;
+	neg.f64 	%fd59, %fd58;
+	fma.rn.f64 	%fd60, %fd53, %fd24, %fd59;
+	fma.rn.f64 	%fd61, %fd53, %fd43, %fd60;
+	fma.rn.f64 	%fd62, %fd57, %fd24, %fd61;
+	mul.rn.f64 	%fd63, %fd50, %fd58;
+	neg.f64 	%fd64, %fd63;
+	fma.rn.f64 	%fd65, %fd50, %fd58, %fd64;
+	fma.rn.f64 	%fd66, %fd50, %fd62, %fd65;
+	fma.rn.f64 	%fd67, %fd52, %fd58, %fd66;
+	add.f64 	%fd68, %fd63, %fd67;
+	sub.f64 	%fd69, %fd63, %fd68;
+	add.f64 	%fd70, %fd67, %fd69;
+	add.f64 	%fd71, %fd24, %fd68;
+	sub.f64 	%fd72, %fd24, %fd71;
+	add.f64 	%fd73, %fd68, %fd72;
+	add.f64 	%fd74, %fd70, %fd73;
+	add.f64 	%fd75, %fd43, %fd74;
+	add.f64 	%fd76, %fd71, %fd75;
+	sub.f64 	%fd77, %fd71, %fd76;
+	add.f64 	%fd78, %fd75, %fd77;
+	xor.b32  	%r25, %r48, -2147483648;
+	mov.u32 	%r26, 1127219200;
+	mov.b64 	%fd79, {%r25, %r26};
+	mov.u32 	%r27, -2147483648;
+	mov.b64 	%fd80, {%r27, %r26};
+	sub.f64 	%fd81, %fd79, %fd80;
+	mov.f64 	%fd82, 0d3FE62E42FEFA39EF;
+	fma.rn.f64 	%fd83, %fd81, %fd82, %fd76;
+	neg.f64 	%fd84, %fd81;
+	fma.rn.f64 	%fd85, %fd84, %fd82, %fd83;
+	sub.f64 	%fd86, %fd85, %fd76;
+	sub.f64 	%fd87, %fd78, %fd86;
+	mov.f64 	%fd88, 0d3C7ABC9E3B39803F;
+	fma.rn.f64 	%fd89, %fd81, %fd88, %fd87;
+	add.f64 	%fd90, %fd83, %fd89;
+	sub.f64 	%fd91, %fd83, %fd90;
+	add.f64 	%fd92, %fd89, %fd91;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r28}, %fd13;
+	}
+	add.s32 	%r29, %r28, %r28;
+	setp.gt.u32	%p3, %r29, -33554433;
+	and.b32  	%r30, %r28, -15728641;
+	selp.b32	%r31, %r30, %r28, %p3;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r32, %temp}, %fd13;
+	}
+	mov.b64 	%fd93, {%r32, %r31};
+	mul.rn.f64 	%fd94, %fd90, %fd93;
+	neg.f64 	%fd95, %fd94;
+	fma.rn.f64 	%fd96, %fd90, %fd93, %fd95;
+	fma.rn.f64 	%fd97, %fd92, %fd93, %fd96;
+	add.f64 	%fd4, %fd94, %fd97;
+	sub.f64 	%fd98, %fd94, %fd4;
+	add.f64 	%fd5, %fd97, %fd98;
+	mov.f64 	%fd99, 0d3FF71547652B82FE;
+	mul.rn.f64 	%fd100, %fd4, %fd99;
+	mov.f64 	%fd101, 0d4338000000000000;
+	add.rn.f64 	%fd102, %fd100, %fd101;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r13, %temp}, %fd102;
+	}
+	mov.f64 	%fd103, 0dC338000000000000;
+	add.rn.f64 	%fd104, %fd102, %fd103;
+	mov.f64 	%fd105, 0dBFE62E42FEFA39EF;
+	fma.rn.f64 	%fd106, %fd104, %fd105, %fd4;
+	mov.f64 	%fd107, 0dBC7ABC9E3B39803F;
+	fma.rn.f64 	%fd108, %fd104, %fd107, %fd106;
+	mov.f64 	%fd109, 0d3E928AF3FCA213EA;
+	mov.f64 	%fd110, 0d3E5ADE1569CE2BDF;
+	fma.rn.f64 	%fd111, %fd110, %fd108, %fd109;
+	mov.f64 	%fd112, 0d3EC71DEE62401315;
+	fma.rn.f64 	%fd113, %fd111, %fd108, %fd112;
+	mov.f64 	%fd114, 0d3EFA01997C89EB71;
+	fma.rn.f64 	%fd115, %fd113, %fd108, %fd114;
+	mov.f64 	%fd116, 0d3F2A01A014761F65;
+	fma.rn.f64 	%fd117, %fd115, %fd108, %fd116;
+	mov.f64 	%fd118, 0d3F56C16C1852B7AF;
+	fma.rn.f64 	%fd119, %fd117, %fd108, %fd118;
+	mov.f64 	%fd120, 0d3F81111111122322;
+	fma.rn.f64 	%fd121, %fd119, %fd108, %fd120;
+	mov.f64 	%fd122, 0d3FA55555555502A1;
+	fma.rn.f64 	%fd123, %fd121, %fd108, %fd122;
+	mov.f64 	%fd124, 0d3FC5555555555511;
+	fma.rn.f64 	%fd125, %fd123, %fd108, %fd124;
+	mov.f64 	%fd126, 0d3FE000000000000B;
+	fma.rn.f64 	%fd127, %fd125, %fd108, %fd126;
+	fma.rn.f64 	%fd128, %fd127, %fd108, %fd18;
+	fma.rn.f64 	%fd129, %fd128, %fd108, %fd18;
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%r14, %temp}, %fd129;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r15}, %fd129;
+	}
+	shl.b32 	%r33, %r13, 20;
+	add.s32 	%r34, %r15, %r33;
+	mov.b64 	%fd135, {%r14, %r34};
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r35}, %fd4;
+	}
+	mov.b32 	 %f2, %r35;
+	abs.f32 	%f1, %f2;
+	setp.lt.f32	%p4, %f1, 0f4086232B;
+	@%p4 bra 	BB8_7;
+
+	setp.lt.f64	%p5, %fd4, 0d0000000000000000;
+	add.f64 	%fd130, %fd4, 0d7FF0000000000000;
+	selp.f64	%fd135, 0d0000000000000000, %fd130, %p5;
+	setp.geu.f32	%p6, %f1, 0f40874800;
+	@%p6 bra 	BB8_7;
+
+	shr.u32 	%r36, %r13, 31;
+	add.s32 	%r37, %r13, %r36;
+	shr.s32 	%r38, %r37, 1;
+	shl.b32 	%r39, %r38, 20;
+	add.s32 	%r40, %r39, %r15;
+	mov.b64 	%fd131, {%r14, %r40};
+	sub.s32 	%r41, %r13, %r38;
+	shl.b32 	%r42, %r41, 20;
+	add.s32 	%r43, %r42, 1072693248;
+	mov.u32 	%r44, 0;
+	mov.b64 	%fd132, {%r44, %r43};
+	mul.f64 	%fd135, %fd131, %fd132;
+
+BB8_7:
+	abs.f64 	%fd133, %fd135;
+	setp.eq.f64	%p7, %fd133, 0d7FF0000000000000;
+	@%p7 bra 	BB8_9;
+
+	fma.rn.f64 	%fd135, %fd135, %fd5, %fd135;
+
+BB8_9:
+	st.param.f64	[func_retval0+0], %fd135;
+	ret;
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 4fc8d40..4a70313 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -71,6 +71,11 @@ public class DMLConfig
 	public static final String CP_PARALLEL_MATRIXMULT = "cp.parallel.matrixmult";
 	public static final String CP_PARALLEL_TEXTIO   = "cp.parallel.textio";
 	public static final String COMPRESSED_LINALG    = "compressed.linalg";
+	// Fraction of available memory to use. The available memory is computer when the JCudaContext is created
+	// to handle the tradeoff on calling cudaMemGetInfo too often.
+	public static final String GPU_MEMORY_UTILIZATION_FACTOR    = "gpu.memory.util.factor";
+	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. 
+	public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME    = "gpu.memory.refresh";
 
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
@@ -102,6 +107,8 @@ public class DMLConfig
 		_defaultVals.put(CP_PARALLEL_MATRIXMULT, "true" );
 		_defaultVals.put(CP_PARALLEL_TEXTIO,     "true" );
 		_defaultVals.put(COMPRESSED_LINALG,      "false" );
+		_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
+		_defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME,      "true" );
 	}
 	
 	public DMLConfig()
@@ -236,6 +243,10 @@ public class DMLConfig
 		return Boolean.parseBoolean( getTextValue(tagName) );
 	}
 	
+	public double getDoubleValue( String tagName )
+	{
+		return Double.parseDouble( getTextValue(tagName) );
+	}
 	
 	/**
 	 * Method to get the string value of an element identified by a tag name

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index 5eac832..1f1a8fd 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -553,9 +553,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
 		
 		ExecType et = ExecType.CP;
-//		if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
-		//TODO: Fix me. Currently forcing the instruction to GPU if gpu flag is set
-		if(DMLScript.USE_ACCELERATOR) {
+		if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
 			et = ExecType.GPU;
 		}
 		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index dc8c8a5..24d9f7c 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
@@ -569,7 +570,10 @@ public class BinaryOp extends Hop
 			else //general case
 				ot = HopsOpOp2LopsU.get(op);
 			
-			
+			if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) 
+					&& (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW) ) {
+				et = ExecType.GPU;
+			}
 			Unary unary1 = new Unary(getInput().get(0).constructLops(),
 						   getInput().get(1).constructLops(), ot, getDataType(), getValueType(), et);
 		
@@ -584,6 +588,11 @@ public class BinaryOp extends Hop
 			ExecType et = optFindExecType();
 			if ( et == ExecType.CP ) 
 			{
+				if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) 
+						&& (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW)) {
+					et = ExecType.GPU;
+				}
+				
 				Binary binary = new Binary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), HopsOpOp2LopsB.get(op),
 						getDataType(), getValueType(), et);
 				

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/hops/ReorgOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java
index 21b3069..abe03a8 100644
--- a/src/main/java/org/apache/sysml/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java
@@ -21,6 +21,7 @@ package org.apache.sysml.hops;
 
 import java.util.ArrayList;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
@@ -132,6 +133,9 @@ public class ReorgOp extends Hop implements MultiThreadedHop
 					setLops(lin); //if input of size 1x1, avoid unnecessary transpose
 				else { //general case
 					int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+					if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+						et = ExecType.GPU;
+					}
 					Transform transform1 = new Transform( lin, 
 							HopsTransf2Lops.get(op), getDataType(), getValueType(), et, k);
 					setOutputDimensions(transform1);
@@ -684,4 +688,4 @@ public class ReorgOp extends Hop implements MultiThreadedHop
 		
 		return ret;
 	}	
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 96147b9..cef0432 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -297,12 +297,15 @@ public class ExecutionContext
 		if( mo.getGPUObject() == null ) {
 			mo.setGPUObject(GPUContext.createGPUObject(mo));
 		}
+		boolean acquired = false;
 		if( !mo.getGPUObject().isAllocated ) {
 			mo.acquireRead();
-			mo.release();
-			//FIXME: after release the matrix block might get evicted
+			acquired = true;
 		}
 		mo.getGPUObject().acquireDeviceRead();
+		if(acquired) {
+			mo.release();
+		}
 		return mo;
 	}
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ProgramConverter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ProgramConverter.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ProgramConverter.java
index 16d00b9..c7f54c3 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ProgramConverter.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ProgramConverter.java
@@ -75,6 +75,7 @@ import org.apache.sysml.runtime.instructions.cp.IntObject;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.instructions.cp.StringObject;
 import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.mr.MRInstruction;
 import org.apache.sysml.runtime.instructions.spark.SPInstruction;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
@@ -517,7 +518,8 @@ public class ProgramConverter
 		
 		try
 		{
-			if( oInst instanceof CPInstruction || oInst instanceof SPInstruction || oInst instanceof MRInstruction )
+			if( oInst instanceof CPInstruction || oInst instanceof SPInstruction || oInst instanceof MRInstruction 
+					|| oInst instanceof GPUInstruction )
 			{
 				if( oInst instanceof FunctionCallCPInstruction && cpFunctions )
 				{

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 20527df..96b82de 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -22,10 +22,12 @@ import java.util.HashMap;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.ArithmeticBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {
@@ -39,6 +41,22 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "maxpooling_backward",    GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "ba+*",                   GPUINSTRUCTION_TYPE.AggregateBinary);
 		String2GPUInstructionType.put( "tsmm",                   GPUINSTRUCTION_TYPE.MMTSJ);
+		String2GPUInstructionType.put( "r'",                   	 GPUINSTRUCTION_TYPE.Reorg);
+	
+		// 
+		String2GPUInstructionType.put( "+"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "-"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "*"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "/"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "%%"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "%/%"  , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "^"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "1-*"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+		String2GPUInstructionType.put( "^2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
+		String2GPUInstructionType.put( "*2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+		String2GPUInstructionType.put( "-nz"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
+		String2GPUInstructionType.put( "+*"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); 
+		String2GPUInstructionType.put( "-*"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); 
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str ) 
@@ -74,6 +92,12 @@ public class GPUInstructionParser  extends InstructionParser
 			case MMTSJ:
 				return MMTSJGPUInstruction.parseInstruction(str);
 				
+			case Reorg:
+				return ReorgGPUInstruction.parseInstruction(str);
+				
+			case ArithmeticBinary:
+				return ArithmeticBinaryGPUInstruction.parseInstruction(str);
+				
 			default: 
 				throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype );
 		}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
new file mode 100644
index 0000000..6d76a7f
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ArithmeticBinaryGPUInstruction.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+
+public abstract class ArithmeticBinaryGPUInstruction extends GPUInstruction {
+
+	protected CPOperand _input1;
+	protected CPOperand _input2;
+	protected CPOperand _output;
+
+	public ArithmeticBinaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) {
+		super(op, opcode, istr);
+		_gputype = GPUINSTRUCTION_TYPE.ArithmeticBinary;
+		_input1 = in1;
+		_input2 = in2;
+	    _output = out;
+	}
+	
+	public static ArithmeticBinaryGPUInstruction parseInstruction ( String str ) throws DMLRuntimeException {
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		InstructionUtils.checkNumFields ( parts, 3 );
+		
+		String opcode = parts[0];
+		CPOperand in1 = new CPOperand(parts[1]);
+		CPOperand in2 = new CPOperand(parts[2]);
+		CPOperand out = new CPOperand(parts[3]);
+		
+		DataType dt1 = in1.getDataType();
+		DataType dt2 = in2.getDataType();
+		DataType dt3 = out.getDataType();
+	 
+		Operator operator = (dt1 != dt2) ?
+				InstructionUtils.parseScalarBinaryOperator(opcode, (dt1 == DataType.SCALAR)) : 
+				InstructionUtils.parseBinaryOperator(opcode);
+		
+		if(dt1 == DataType.MATRIX && dt2 == DataType.MATRIX && dt3 == DataType.MATRIX) {
+			return new MatrixMatrixArithmeticGPUInstruction(operator, in1, in2, out, opcode, str);	
+		}
+		else if( dt3 == DataType.MATRIX && ((dt1 == DataType.SCALAR && dt2 == DataType.MATRIX) || (dt1 == DataType.MATRIX && dt2 == DataType.SCALAR)) ) {
+			return new ScalarMatrixArithmeticGPUInstruction(operator, in1, in2, out, opcode, str);
+		}
+		else
+			throw new DMLRuntimeException("Unsupported GPU ArithmeticInstruction.");
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index d842ac8..b2f2eb2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
 
 public abstract class GPUInstruction extends Instruction 
 {
-	public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution, MMTSJ }; 
+	public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary }; 
 	
 	protected GPUINSTRUCTION_TYPE _gputype;
 	protected Operator _optr;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index b3fdae2..4c05833 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -107,8 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
 
                 //execute operations 
                 ec.setMetaData(_output.getName(), rlen, clen);
-                MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName());
-                LibMatrixCUDA.matmultTSMM(mat, out, isLeftTransposed);
+                LibMatrixCUDA.matmultTSMM(ec, mat, _output.getName(), isLeftTransposed);
                 
                 ec.releaseMatrixInputForGPUInstruction(_input.getName());
                 ec.releaseMatrixOutputForGPUInstruction(_output.getName());
@@ -122,4 +121,4 @@ public class MMTSJGPUInstruction extends GPUInstruction
         {
                 return _type;
         }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
new file mode 100644
index 0000000..558aa4d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.utils.Statistics;
+
+public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction
+{
+	
+	public MatrixMatrixArithmeticGPUInstruction(Operator op, 
+											   CPOperand in1, 
+											   CPOperand in2, 
+											   CPOperand out, 
+											   String opcode,
+											   String istr){
+		super(op, in1, in2, out, opcode, istr);
+	}
+	
+	@Override
+	public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		Statistics.incrementNoOfExecutedGPUInst();
+		
+		MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
+		MatrixObject in2 = ec.getMatrixInputForGPUInstruction(_input2.getName());
+		
+		//TODO: make hop level changes for this
+		boolean isLeftTransposed = false;
+		boolean isRightTransposed = false;
+		int rlen = isLeftTransposed ? (int) in1.getNumColumns() : (int) in1.getNumRows();
+		int clen = isLeftTransposed ? (int) in1.getNumRows() : (int) in1.getNumColumns();
+		
+		ec.setMetaData(_output.getName(), rlen, clen);
+		
+		BinaryOperator bop = (BinaryOperator) _optr;
+		LibMatrixCUDA.bincellOp(ec, in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+		
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
new file mode 100644
index 0000000..b126b78
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.SwapIndex;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysml.utils.Statistics;
+
+
+public class ReorgGPUInstruction extends GPUInstruction
+{
+ 	private CPOperand _input;
+    private CPOperand _output;
+ 	/**
+ 	 * for opcodes r'
+ 	 * 
+ 	 * @param op
+ 	 * @param in
+ 	 * @param out
+ 	 * @param opcode
+ 	 * @param istr
+ 	 */
+	public ReorgGPUInstruction(Operator op, CPOperand in, CPOperand out, String opcode, String istr) {
+		super(op, opcode, istr);
+		_gputype = GPUINSTRUCTION_TYPE.Reorg;
+		_input = in;
+        _output = out;
+	}
+	
+	public static ReorgGPUInstruction parseInstruction ( String str ) 
+		throws DMLRuntimeException 
+	{
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+        InstructionUtils.checkNumFields ( parts, 2 );
+        
+		String opcode = parts[0];
+		CPOperand in = new CPOperand(parts[1]);
+		CPOperand out = new CPOperand(parts[2]);	
+					
+		if ( !(opcode.equalsIgnoreCase("r'")) ) {
+			throw new DMLRuntimeException("Unknown opcode while parsing a ReorgInstruction: " + str);
+		}
+		else
+			return new ReorgGPUInstruction(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, str);
+	}
+	
+	@Override
+	public void processInstruction(ExecutionContext ec)
+			throws DMLRuntimeException 
+	{
+		Statistics.incrementNoOfExecutedGPUInst();
+		//acquire input
+		MatrixObject mat = ec.getMatrixInputForGPUInstruction(_input.getName());	
+
+		int rlen = (int) mat.getNumColumns();
+		int clen = (int) mat.getNumRows();
+		
+		//execute operation
+		ec.setMetaData(_output.getName(), rlen, clen);
+		LibMatrixCUDA.transpose(ec, mat, _output.getName());
+		
+		//release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input.getName());
+        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
new file mode 100644
index 0000000..0832591
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Multiply2;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.utils.Statistics;
+
+public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUInstruction {
+	public ScalarMatrixArithmeticGPUInstruction(Operator op, 
+			   									CPOperand in1, 
+			   									CPOperand in2, 
+			   									CPOperand out, 
+			   									String opcode,
+			   									String istr){
+		super(op, in1, in2, out, opcode, istr);
+	}
+	
+	@Override
+	public void processInstruction(ExecutionContext ec) 
+		throws DMLRuntimeException
+	{
+		Statistics.incrementNoOfExecutedGPUInst();
+		
+		CPOperand mat = ( _input1.getDataType() == DataType.MATRIX ) ? _input1 : _input2;
+		CPOperand scalar = ( _input1.getDataType() == DataType.MATRIX ) ? _input2 : _input1;
+		MatrixObject in1 = ec.getMatrixInputForGPUInstruction(mat.getName());
+		ScalarObject constant = (ScalarObject) ec.getScalarInput(scalar.getName(), scalar.getValueType(), scalar.isLiteral());
+		
+		boolean isTransposed = false;
+		int rlen = isTransposed ? (int) in1.getNumColumns() : (int) in1.getNumRows();
+		int clen = isTransposed ? (int) in1.getNumRows() : (int) in1.getNumColumns();
+		
+		ec.setMetaData(_output.getName(), rlen, clen);
+		
+		ScalarOperator sc_op = (ScalarOperator) _optr;
+		sc_op.setConstant(constant.getDoubleValue());
+		
+		LibMatrixCUDA.bincellOp(ec, in1, _output.getName(), isTransposed, sc_op);
+		
+		ec.releaseMatrixInputForGPUInstruction(mat.getName());
+        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
new file mode 100644
index 0000000..596d837
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+
+import jcuda.driver.CUstream;
+
+/**
+ * Java Wrapper to specify CUDA execution configuration for launching custom kernels
+ */
+public class ExecutionConfig {
+	public int gridDimX; 
+	public int gridDimY = 1;
+	public int gridDimZ = 1;
+	public int blockDimX; 
+	public int blockDimY = 1;
+	public int blockDimZ = 1;
+	public int sharedMemBytes = 0;
+	public CUstream stream = null;
+	
+//	private static HashMap<Integer, Integer> maxBlockDimXForDevice = new HashMap<Integer, Integer>();
+//	private static HashMap<Integer, Integer> maxBlockDimYForDevice = new HashMap<Integer, Integer>();
+	
+	/**
+	 * Use this for simple vector operations and use following in the kernel 
+	 * <code> 
+	 * int index = blockIdx.x * blockDim.x + threadIdx.x 
+	 * </code>
+	 * 
+	 * This tries to schedule as minimum grids as possible.
+	 * 
+	 * @param numCells
+	 * @return
+	 * @throws DMLRuntimeException
+	 */
+	public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) throws DMLRuntimeException {
+		int deviceNumber = 0;
+		int blockDimX = getMaxBlockDimX(deviceNumber);
+		int gridDimX = (int)Math.ceil((double)numCells / blockDimX);
+		return new ExecutionConfig(gridDimX, blockDimX);
+	}
+	
+	/**
+	 * Use this for simple matrix operations and use following in the kernel 
+	 * <code> 
+	 * int ix = blockIdx.x * blockDim.x + threadIdx.x;
+	 * int iy = blockIdx.y * blockDim.y + threadIdx.y;
+	 * </code>
+	 * 
+	 * This tries to schedule as minimum grids as possible.
+	 * 
+	 * @param numCells
+	 * @return
+	 * @throws DMLRuntimeException
+	 */
+	public static ExecutionConfig getConfigForSimpleMatrixOperations(int rlen, int clen) throws DMLRuntimeException {
+		int deviceNumber = 0;
+		int blockDimX = (int) Math.min(getMaxBlockDimX(deviceNumber), rlen);
+		int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
+		int blockDimY = (int)Math.min(getMaxBlockDimY(deviceNumber), clen);
+		int gridDimY = (int)Math.ceil((double)clen / blockDimY);
+		return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
+	}
+	
+	public ExecutionConfig(int gridDimX, int blockDimX) {
+		this.gridDimX = gridDimX;
+		this.blockDimX = blockDimX;
+	}
+	
+	public ExecutionConfig(int gridDimX, int gridDimY, int blockDimX, int blockDimY) {
+		this.gridDimX = gridDimX;
+		this.gridDimY = gridDimY;
+		this.blockDimX = blockDimX;
+		this.blockDimY = blockDimY;
+	}
+	
+	
+	/**
+     * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
+     * 
+     * @return The maximum block dimension, in x-direction
+	 * @throws DMLRuntimeException 
+     */
+    private static int getMaxBlockDimX(int deviceNumber) throws DMLRuntimeException {
+    	return 32;
+    	// TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy
+//    	Integer ret = maxBlockDimXForDevice.get(deviceNumber);
+//    	if(ret == null) {
+//    		CUdevice device = new CUdevice();
+//            JCudaKernels.checkResult(cuDeviceGet(device, deviceNumber));
+//            int maxBlockDimX[] =  {0};
+//            cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
+//            maxBlockDimXForDevice.put(deviceNumber, maxBlockDimX[0]);
+//            return maxBlockDimX[0];
+//    	}
+//        return ret;
+    }
+    
+    /**
+     * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y of the given device
+     * 
+     * @return The maximum block dimension, in y-direction
+	 * @throws DMLRuntimeException 
+     */
+    private static int getMaxBlockDimY(int deviceNumber) throws DMLRuntimeException {
+    	return 32;
+    	// TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy
+//    	Integer ret = maxBlockDimYForDevice.get(deviceNumber);
+//    	if(ret == null) {
+//    		CUdevice device = new CUdevice();
+//            JCudaKernels.checkResult(cuDeviceGet(device, deviceNumber));
+//            int maxBlockDimY[] =  {0};
+//            cuDeviceGetAttribute(maxBlockDimY, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device);
+//            maxBlockDimYForDevice.put(deviceNumber, maxBlockDimY[0]);
+//            return maxBlockDimY[0];
+//    	}
+//        return ret;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 06f5f1a..6e8d7ea 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -40,8 +40,9 @@ public abstract class GPUContext {
 	 * Creation / Destruction of GPUContext and related handles
 	 * 
 	 * @return GPU context
+	 * @throws DMLRuntimeException 
 	 */
-	public static GPUContext createGPUContext() {
+	public static GPUContext createGPUContext() throws DMLRuntimeException {
 		if(currContext == null && DMLScript.USE_ACCELERATOR) {
 			synchronized(isGPUContextCreated) {
 				currContext = new JCudaContext();
@@ -66,4 +67,4 @@ public abstract class GPUContext {
 	public abstract void destroy() throws DMLRuntimeException;
 	
 	
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bfc0e0dc/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
index bc16a05..d94532c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.utils.Statistics;
@@ -34,7 +36,6 @@ import jcuda.runtime.JCuda;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
-
 import static jcuda.jcudnn.JCudnn.cudnnCreate;
 import static jcuda.jcublas.JCublas2.cublasCreate;
 import static jcuda.jcublas.JCublas2.cublasDestroy;
@@ -62,10 +63,10 @@ public class JCudaContext extends GPUContext {
 	public static long totalNumBytes = 0;
 	public static AtomicLong availableNumBytesWithoutUtilFactor = new AtomicLong(0);
 	// Fraction of available memory to use. The available memory is computer when the JCudaContext is created
-	// to handle the tradeoff on calling cudaMemGetInfo too often. 
-	public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; 
-	public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true;
-	
+	// to handle the tradeoff on calling cudaMemGetInfo too often.
+	public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
+	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
+	public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
 	static {
 		long start = System.nanoTime();
 		JCuda.setExceptionsEnabled(true);
@@ -98,7 +99,7 @@ public class JCudaContext extends GPUContext {
 	}
 	
 	
-	public JCudaContext() {
+	public JCudaContext() throws DMLRuntimeException {
 		if(isGPUContextCreated) {
 			// Wait until it is deleted. This case happens during multi-threaded testing.
 			// This also allows for multi-threaded execute calls
@@ -139,6 +140,8 @@ public class JCudaContext extends GPUContext {
         }
         LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
         LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
+        
+        LibMatrixCUDA.kernels = new JCudaKernels();
 	}
 
 	@Override