You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ma...@apache.org on 2020/11/07 23:49:00 UTC

[systemds] branch master updated: [SYSTEMDS-2691, 2692, 2698] Initial SPOOF CUDA; Refactoring of CUDA kernels

This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 52f2b3e  [SYSTEMDS-2691, 2692, 2698] Initial SPOOF CUDA; Refactoring of CUDA kernels
52f2b3e is described below

commit 52f2b3e27cc012272fe2ead3a3d0cde49983d4fb
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Sun Nov 8 00:47:15 2020 +0100

    [SYSTEMDS-2691, 2692, 2698] Initial SPOOF CUDA; Refactoring of CUDA kernels
    
    Changelog from squashed commits:
    
    * Refactor CUDA codebase to its own directory
    * JNI parts for CUDA codegen and more code reorganization
    * SpoofCompiler CUDA loading and compiler invocation
    * Cellwise code template and other CPlan templates for operators (*nary/data/...)
    * SpoofCUDA runtime instruction and also:
    * Import jitify as submodule
    * Template loading from JAR file
    * Configuration tag <sysds.codegen.api> to use CUDA codegen (see SystemDS-config.xml.template)
    
    Closes #1092
---
 .gitmodules                                        |    3 +
 conf/SystemDS-config.xml.template                  |    9 +-
 pom.xml                                            |   33 +-
 src/main/cpp/kernels/Makefile                      |   40 -
 .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so |  Bin 0 -> 1008760 bytes
 .../lib/libsystemds_spoof_cuda-Windows-AMD64.dll   |  Bin 0 -> 220672 bytes
 src/main/cuda/CMakeLists.txt                       |  111 +
 src/main/cuda/ext/jitify                           |    1 +
 src/main/{cpp/kernels => cuda/headers}/agg_ops.cuh |   40 +-
 src/main/{cpp/kernels => cuda/headers}/cum_max.cuh |    9 +-
 src/main/{cpp/kernels => cuda/headers}/cum_min.cuh |    7 +-
 .../{cpp/kernels => cuda/headers}/cum_prod.cuh     |    7 +-
 .../{cpp/kernels => cuda/headers}/cum_scan.cuh     |   10 +-
 src/main/{cpp/kernels => cuda/headers}/cum_sum.cuh |    7 +-
 .../{cpp/kernels => cuda/headers}/cum_sum_prod.cuh |    8 +-
 src/main/cuda/headers/reduction.cuh                |  314 ++
 src/main/cuda/headers/spoof_utils.cuh              |   94 +
 src/main/{cpp/kernels => cuda/headers}/utils.cuh   |   23 +-
 src/main/{cpp => cuda}/kernels/SystemDS.cu         |   17 -
 src/main/{cpp => cuda}/kernels/SystemDS.ptx        |    0
 src/main/cuda/kernels/reduction.cu                 |  282 ++
 src/main/cuda/kernels/reduction.ptx                | 3546 ++++++++++++++++++++
 src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp  |  141 +
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h    |  269 ++
 src/main/cuda/spoof-launcher/host_utils.h          |   48 +
 src/main/cuda/spoof-launcher/jni_bridge.cpp        |  109 +
 src/main/cuda/spoof-launcher/jni_bridge.h          |   81 +
 src/main/cuda/spoof/cellwise.cu                    |   54 +
 src/main/cuda/spoof/functions.cuh                  |   86 +
 src/main/java/org/apache/sysds/api/DMLScript.java  |   33 +-
 .../apache/sysds/conf/ConfigurationManager.java    |   12 +-
 src/main/java/org/apache/sysds/conf/DMLConfig.java |    5 +-
 .../apache/sysds/hops/codegen/SpoofCompiler.java   |  209 +-
 .../apache/sysds/hops/codegen/SpoofFusedOp.java    |   46 +-
 .../org/apache/sysds/hops/codegen/cplan/CNode.java |   42 +-
 .../sysds/hops/codegen/cplan/CNodeBinary.java      |  175 +-
 .../apache/sysds/hops/codegen/cplan/CNodeCell.java |  102 +-
 .../apache/sysds/hops/codegen/cplan/CNodeData.java |   49 +-
 .../sysds/hops/codegen/cplan/CNodeMultiAgg.java    |   19 +-
 .../apache/sysds/hops/codegen/cplan/CNodeNary.java |   22 +-
 .../hops/codegen/cplan/CNodeOuterProduct.java      |   17 +-
 .../apache/sysds/hops/codegen/cplan/CNodeRow.java  |   18 +-
 .../sysds/hops/codegen/cplan/CNodeTernary.java     |   62 +-
 .../apache/sysds/hops/codegen/cplan/CNodeTpl.java  |   12 +-
 .../sysds/hops/codegen/cplan/CNodeUnary.java       |  121 +-
 .../sysds/hops/codegen/cplan/CodeTemplate.java}    |   38 +-
 .../sysds/hops/codegen/cplan/cpp/Binary.java       |  322 ++
 .../sysds/hops/codegen/cplan/cpp/CellWise.java     |   76 +
 .../sysds/hops/codegen/cplan/cpp/Ternary.java      |  130 +
 .../apache/sysds/hops/codegen/cplan/cpp/Unary.java |  258 ++
 .../sysds/hops/codegen/cplan/java/Binary.java      |  200 ++
 .../sysds/hops/codegen/cplan/java/CellWise.java    |   79 +
 .../sysds/hops/codegen/cplan/java/Ternary.java     |   89 +
 .../sysds/hops/codegen/cplan/java/Unary.java       |  152 +
 .../java/org/apache/sysds/lops/SpoofFused.java     |   30 +-
 .../apache/sysds/runtime/codegen/CodegenUtils.java |   14 +-
 .../apache/sysds/runtime/codegen/SpoofCUDA.java    |  121 +
 .../sysds/runtime/codegen/SpoofOperator.java       |    9 +-
 .../controlprogram/context/ExecutionContext.java   |   26 +-
 .../runtime/functionobjects/IntegerDivide.java     |    4 +-
 .../runtime/instructions/GPUInstructionParser.java |   10 +-
 .../instructions/cp/SpoofCPInstruction.java        |    4 +-
 .../runtime/instructions/gpu/GPUInstruction.java   |    7 +-
 .../instructions/gpu/SpoofCUDAInstruction.java     |  119 +
 .../instructions/gpu/context/GPUContextPool.java   |    6 +-
 .../instructions/gpu/context/GPUObject.java        |   19 +
 .../instructions/gpu/context/JCudaKernels.java     |    2 +-
 .../instructions/spark/SpoofSPInstruction.java     |    6 +-
 .../sysds/runtime/matrix/data/LibMatrixNative.java |    2 +-
 .../java/org/apache/sysds/utils/NativeHelper.java  |    8 +-
 .../test/functions/codegen/CellwiseTmplTest.java   |    6 +-
 src/test/resources/log4j.properties                |    1 +
 .../functions/codegen/SystemDS-config-codegen.xml  |    2 +
 73 files changed, 7451 insertions(+), 582 deletions(-)

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8d14805
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "jitify"]
+	path = src/main/cuda/ext/jitify
+	url = git@github.com:NVIDIA/jitify.git
diff --git a/conf/SystemDS-config.xml.template b/conf/SystemDS-config.xml.template
index 7fa4d98..171cfb3 100644
--- a/conf/SystemDS-config.xml.template
+++ b/conf/SystemDS-config.xml.template
@@ -41,12 +41,15 @@
     
     <!-- enables operator fusion via code generation, experimental feature -->
     <sysds.codegen.enabled>false</sysds.codegen.enabled>
-    
-    <!-- set the codegen java compiler (auto, janino, javac) -->
+
+    <!-- set the codegen API (auto, java, cuda) -->
+   <sysds.codegen.api>auto</sysds.codegen.api>
+
+    <!-- set the codegen java compiler (auto, janino, javac, nvcc, nvrtc) -->
     <sysds.codegen.compiler>auto</sysds.codegen.compiler>
 
     <!-- set the codegen optimizer (fuse_all, fuse_no_redundancy, fuse_cost_based_v2) -->
-    <sysds.codegen.compiler>fuse_cost_based_v2</sysds.codegen.compiler>
+    <sysds.codegen.optimizer>fuse_cost_based_v2</sysds.codegen.optimizer>
     
     <!-- if codegen.enabled, enables source code caching of fused operators -->
     <sysds.codegen.plancache>true</sysds.codegen.plancache>
diff --git a/pom.xml b/pom.xml
index 5f3da30..4027916 100644
--- a/pom.xml
+++ b/pom.xml
@@ -87,18 +87,31 @@
 				<targetPath>scripts</targetPath>
 			</resource>
 			<resource>
-				<directory>src/main/cpp/kernels</directory>
-				<excludes>
-					<exclude>*.cu</exclude>
-					<exclude>*.cuh</exclude>
-					<exclude>Makefile</exclude>
-				</excludes>
-				<targetPath>kernels</targetPath>
+				<directory>src/main/cuda/kernels</directory>
+				<includes>
+					<include>SystemDS.ptx</include>
+					<include>reduction.ptx</include>
+				</includes>
+				<targetPath>cuda/kernels</targetPath>
 			</resource>
 			<resource>
 				<directory>src/main/cpp/lib</directory>
 				<targetPath>lib</targetPath>
 			</resource>
+			<resource>
+				<directory>src/main/cuda/spoof</directory>
+				<targetPath>cuda/spoof</targetPath>
+			</resource>
+			<resource>
+				<directory>src/main/cuda/headers</directory>
+				<includes>
+					<include>agg_ops.cuh</include>
+					<include>reduction.cuh</include>
+					<include>spoof_utils.cuh</include>
+					<include>utils.cuh</include>
+				</includes>
+				<targetPath>cuda/headers</targetPath>
+			</resource>
 		</resources>
 
 		<plugins>
@@ -531,6 +544,7 @@
 						<configuration>
 							<excludes>
 								<exclude>.gitignore</exclude>
+								<exclude>.gitmodules</exclude>
 								<exclude>.repository/</exclude>
 								<exclude>.idea/</exclude>
 								<exclude>.git</exclude>
@@ -567,7 +581,8 @@
 								<exclude>src/main/java/*.tokens</exclude>
 								<exclude>**/*.interp</exclude>
 								<!-- Compiled ptx file from nvcc -->
-								<exclude>src/main/cpp/kernels/SystemDS.ptx</exclude>
+								<exclude>src/main/cuda/kernels/SystemDS.ptx</exclude>
+								<exclude>src/main/cuda/kernels/reduction.ptx</exclude>
 								<!-- Test Validation files -->
 								<exclude>src/test/scripts/functions/jmlc/**/*.impute</exclude>
 								<exclude>src/test/scripts/functions/jmlc/**/*.map</exclude>
@@ -586,6 +601,8 @@
 								<exclude>src/main/python/tests/lt*.txt</exclude>
 								<!-- Perftest requirement file -->
 								<exclude>scripts/perftest/python/requirements.txt</exclude>
+								<!-- external sources -->
+								<exclude>src/main/cuda/ext/**</exclude>
 							</excludes>
 						</configuration>
 					</plugin>
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
deleted file mode 100644
index 8766114..0000000
--- a/src/main/cpp/kernels/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-#-------------------------------------------------------------
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#-------------------------------------------------------------
-
-NVCC=nvcc
-GCC=gcc-8
-
-# set the CUDA-supported version of gcc through -ccbin
-CUDAFLAGS= -ptx -c -arch=sm_30 --std c++11 -ccbin $(GCC)
-
-# Flags for compilation on recent Ubuntu + P100
-#CUDAFLAGS= -ptx -c --std c++11 -ccbin gcc-8 -m64 -gencode arch=compute_60,code=sm_60
-
-# Use these flags for precise math
-#CUDAFLAGS= -ptx -c -arch=sm_30 -ftz=false -prec-div=true -prec-sqrt=true
-
-
-SystemDS.o: SystemDS.cu
-	$(NVCC) $(CUDAFLAGS)  SystemDS.cu
-
-all: SystemDS.o
-
-
-clean:
-	rm -rf SystemDS.ptx
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
new file mode 100644
index 0000000..89f1270
Binary files /dev/null and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll
new file mode 100644
index 0000000..22fe3d5
Binary files /dev/null and b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll differ
diff --git a/src/main/cuda/CMakeLists.txt b/src/main/cuda/CMakeLists.txt
new file mode 100644
index 0000000..8b74dee
--- /dev/null
+++ b/src/main/cuda/CMakeLists.txt
@@ -0,0 +1,111 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
+
+# default to gcc 8.x while we're still supporting CUDA 10.x only
+if (UNIX)
+    set(CMAKE_CUDA_HOST_COMPILER g++-8 CACHE INTERNAL "")
+    set(CMAKE_CUDA_COMPILER nvcc CACHE INTERNAL "")
+    set(CMAKE_CXX_COMPILER g++ CACHE INTERNAL "")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_BUILD_WITH_INSTALL_RPATH True CACHE INTERNAL "")
+
+project(SystemDS LANGUAGES CXX CUDA)
+
+set(SYSDS_HEADERS 
+	headers/agg_ops.cuh
+	headers/cum_max.cuh  
+	headers/cum_min.cuh  
+	headers/cum_prod.cuh  
+	headers/cum_scan.cuh  
+	headers/cum_sum.cuh  
+	headers/cum_sum_prod.cuh  
+	headers/utils.cuh)
+set(SYSDS_SOURCES kernels/SystemDS.cu)
+
+add_library(SystemDS OBJECT ${SYSDS_HEADERS} ${SYSDS_SOURCES})
+target_include_directories(SystemDS PUBLIC "${CMAKE_SOURCE_DIR}/headers")
+
+find_package(CUDAToolkit REQUIRED)
+cmake_policy(SET CMP0104 NEW)
+set(CMAKE_CUDA_ARCHITECTURES  OFF)
+#ToDo: more compiler flag settings for Debug/Release compilation
+set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
+
+set_property(TARGET SystemDS PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
+set_property(TARGET SystemDS PROPERTY CUDA_PTX_COMPILATION ON)
+
+# sets the installation path to src/main/cuda
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}" CACHE PATH "sets the installation path to src/main/cpp/lib" FORCE)
+endif()
+
+install(FILES $<TARGET_OBJECTS:SystemDS> DESTINATION kernels)
+
+#-------------------------------------------------------------
+#project (spoof_cuda LANGUAGES CXX CUDA)
+
+add_library(reduction OBJECT kernels/reduction.cu headers/reduction.cuh)
+target_include_directories(reduction PUBLIC "${CMAKE_SOURCE_DIR}/headers")
+set_property(TARGET reduction PROPERTY CUDA_PTX_COMPILATION ON)
+install(FILES $<TARGET_OBJECTS:reduction> DESTINATION kernels)
+
+include_directories($ENV{JAVA_HOME}/include/)
+
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
+    include_directories($ENV{JAVA_HOME}/include/darwin)
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+    include_directories($ENV{JAVA_HOME}/include/linux)
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+    include_directories($ENV{JAVA_HOME}/include/win32)
+endif()
+
+set(SPOOF_HEADERS 
+	spoof-launcher/jni_bridge.h
+	spoof-launcher/SpoofCUDAContext.h)
+set(SPOOF_SOURCES 
+	spoof-launcher/jni_bridge.cpp
+	spoof-launcher/SpoofCUDAContext.cpp)
+
+add_library(spoof_cuda SHARED ${SPOOF_HEADERS} ${SPOOF_SOURCES} )
+
+target_include_directories(spoof_cuda PRIVATE "${CMAKE_SOURCE_DIR}/ext/jitify")
+target_link_libraries(spoof_cuda CUDA::nvrtc CUDA::cuda_driver CUDA::cudart)
+target_compile_features(spoof_cuda PUBLIC cxx_std_11)
+set_target_properties(spoof_cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(spoof_cuda PROPERTIES OUTPUT_NAME "systemds_spoof_cuda-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+
+# unify naming convention to libsystemds_...
+if (WIN32)    
+    set(CMAKE_IMPORT_LIBRARY_PREFIX lib CACHE INTERNAL "")
+    set(CMAKE_SHARED_LIBRARY_PREFIX lib CACHE INTERNAL "")
+    target_link_libraries(spoof_cuda DbgHelp.lib)
+    install(TARGETS spoof_cuda RUNTIME DESTINATION ../cpp/lib)
+endif()
+
+if(UNIX)
+    install(TARGETS spoof_cuda LIBRARY DESTINATION ../cpp/lib)
+endif()
diff --git a/src/main/cuda/ext/jitify b/src/main/cuda/ext/jitify
new file mode 160000
index 0000000..3e96bcc
--- /dev/null
+++ b/src/main/cuda/ext/jitify
@@ -0,0 +1 @@
+Subproject commit 3e96bcceb9e42105f6a32315abb2af04585a55b0
diff --git a/src/main/cpp/kernels/agg_ops.cuh b/src/main/cuda/headers/agg_ops.cuh
similarity index 76%
rename from src/main/cpp/kernels/agg_ops.cuh
rename to src/main/cuda/headers/agg_ops.cuh
index c53ce04..abe9a2b 100644
--- a/src/main/cpp/kernels/agg_ops.cuh
+++ b/src/main/cuda/headers/agg_ops.cuh
@@ -17,19 +17,19 @@
  * under the License.
  */
 
-#ifndef __AGG_OPS_H
-#define __AGG_OPS_H
-
 #pragma once
+#ifndef AGG_OPS_H
+#define AGG_OPS_H
 
 #include <cuda_runtime.h>
+#include <math_constants.h>
 
 /**
  * Functor op for assignment op. This is a dummy/identity op.
  */
 template<typename T>
 struct IdentityOp {
-	__device__  __forceinline__ T operator()(T a) const {
+	__device__  __forceinline__ T operator()(T a, int idx = 0) const {
 		return a;
 	}
 };
@@ -45,15 +45,39 @@ struct SumOp {
 };
 
 /**
+ * Functor op for sum of squares operation (returns a + b * b)
+ */
+template<typename T>
+struct SumSqOp {
+	__device__  __forceinline__ T operator()(T a, T b) const {
+		return a + b * b;
+	}
+};
+
+/**
  * Functor op for min operation
  */
 template<typename T>
 struct MinOp {
 	__device__  __forceinline__ T operator()(T a, T b) const {
+		return a < b ? a : b;
+	}
+};
+
+template<>
+struct MinOp<double> {
+	__device__  __forceinline__  double operator()(double a, double b) const {
 		return fmin(a, b);
 	}
 };
 
+template<>
+struct MinOp<float> {
+	__device__  __forceinline__ float operator()(float a, float b) const {
+		return fminf(a, b);
+	}
+};
+
 /**
  * Functor op for max operation
  */
@@ -124,10 +148,10 @@ struct MinNeutralElement {
 };
 
 template<>
-float MinNeutralElement<float>::get() { return INFINITY; }
+float MinNeutralElement<float>::get() { return CUDART_INF_F; }
 
 template<>
-double MinNeutralElement<double>::get() { return INFINITY; }
+double MinNeutralElement<double>::get() { return CUDART_INF; }
 
 template<typename T>
 struct MaxNeutralElement {
@@ -135,9 +159,9 @@ struct MaxNeutralElement {
 };
 
 template<>
-float MaxNeutralElement<float>::get() { return -INFINITY; }
+float MaxNeutralElement<float>::get() { return -CUDART_INF_F; }
 
 template<>
-double MaxNeutralElement<double>::get() { return -INFINITY; }
+double MaxNeutralElement<double>::get() { return -CUDART_INF_F; }
 
 #endif // __AGG_OPS_H
diff --git a/src/main/cpp/kernels/cum_max.cuh b/src/main/cuda/headers/cum_max.cuh
similarity index 96%
rename from src/main/cpp/kernels/cum_max.cuh
rename to src/main/cuda/headers/cum_max.cuh
index 2571716..03cbb30 100644
--- a/src/main/cpp/kernels/cum_max.cuh
+++ b/src/main/cuda/headers/cum_max.cuh
@@ -17,10 +17,11 @@
  * under the License.
  */
 
-#ifndef __CUM_MAX_H
-#define __CUM_MAX_H
-
 #pragma once
+#ifndef CUM_MAX_H
+#define CUM_MAX_H
+
+using uint = unsigned int;
 #include <cuda_runtime.h>
 
 /**
@@ -75,4 +76,4 @@ extern "C" __global__ void cumulative_max_down_sweep_f(float *g_idata, float *g_
 	cumulative_scan_down_sweep<MaxOp<float>, MaxNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
 }
 
-#endif // __CUM_MAX_H
+#endif // CUM_MAX_H
diff --git a/src/main/cpp/kernels/cum_min.cuh b/src/main/cuda/headers/cum_min.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_min.cuh
rename to src/main/cuda/headers/cum_min.cuh
index 5ebe659..3e653ba 100644
--- a/src/main/cpp/kernels/cum_min.cuh
+++ b/src/main/cuda/headers/cum_min.cuh
@@ -17,10 +17,11 @@
  * under the License.
  */
 
-#ifndef __CUM_MIN_H
-#define __CUM_MIN_H
-
 #pragma once
+#ifndef CUM_MIN_H
+#define CUM_MIN_H
+
+using uint = unsigned int;
 #include <cuda_runtime.h>
 
 /**
diff --git a/src/main/cpp/kernels/cum_prod.cuh b/src/main/cuda/headers/cum_prod.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_prod.cuh
rename to src/main/cuda/headers/cum_prod.cuh
index f294fc2..f6fc2fe 100644
--- a/src/main/cpp/kernels/cum_prod.cuh
+++ b/src/main/cuda/headers/cum_prod.cuh
@@ -17,10 +17,9 @@
  * under the License.
  */
 
-#ifndef __CUM_PROD_H
-#define __CUM_PROD_H
-
 #pragma once
+#ifndef CUM_PROD_H
+#define CUM_PROD_H
 
 using uint = unsigned int;
 #include <cuda_runtime.h>
@@ -77,4 +76,4 @@ extern "C" __global__ void cumulative_prod_down_sweep_f(float *g_idata, float *g
 	cumulative_scan_down_sweep<ProductOp<float>, ProdNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
 }
 
-#endif // __CUM_PROD_H
+#endif // CUM_PROD_H
diff --git a/src/main/cpp/kernels/cum_scan.cuh b/src/main/cuda/headers/cum_scan.cuh
similarity index 96%
rename from src/main/cpp/kernels/cum_scan.cuh
rename to src/main/cuda/headers/cum_scan.cuh
index e73488d..67e53d6 100644
--- a/src/main/cpp/kernels/cum_scan.cuh
+++ b/src/main/cuda/headers/cum_scan.cuh
@@ -17,10 +17,12 @@
  * under the License.
  */
 
-#ifndef __CUM_SCAN_H
-#define __CUM_SCAN_H
-
 #pragma once
+#ifndef CUM_SCAN_H
+#define CUM_SCAN_H
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
 
 /**
  * Cumulative Scan - Applies <scanOp> to accumulate values over columns of an input matrix.
@@ -84,4 +86,4 @@ __device__ void cumulative_scan_down_sweep(T *g_idata, T *g_odata, T *g_tdata, u
 		g_odata[i] = acc = scan_op(acc, g_idata[i]);
 }
 
-#endif // __CUM_SCAN_H
+#endif // CUM_SCAN_H
diff --git a/src/main/cpp/kernels/cum_sum.cuh b/src/main/cuda/headers/cum_sum.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_sum.cuh
rename to src/main/cuda/headers/cum_sum.cuh
index c142d57..5325138 100644
--- a/src/main/cpp/kernels/cum_sum.cuh
+++ b/src/main/cuda/headers/cum_sum.cuh
@@ -17,10 +17,9 @@
  * under the License.
  */
 
-#ifndef __CUM_SUM_H
-#define __CUM_SUM_H
-
 #pragma once
+#ifndef CUM_SUM_H
+#define CUM_SUM_H
 
 using uint = unsigned int;
 #include <cuda_runtime.h>
@@ -75,4 +74,4 @@ extern "C" __global__ void cumulative_sum_down_sweep_f(float *g_idata, float *g_
 	cumulative_scan_down_sweep<SumOp<float>, SumNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
 }
 
-#endif // __CUM_SUM_H
+#endif // CUM_SUM_H
diff --git a/src/main/cpp/kernels/cum_sum_prod.cuh b/src/main/cuda/headers/cum_sum_prod.cuh
similarity index 98%
rename from src/main/cpp/kernels/cum_sum_prod.cuh
rename to src/main/cuda/headers/cum_sum_prod.cuh
index 969ed30..be228cd 100644
--- a/src/main/cpp/kernels/cum_sum_prod.cuh
+++ b/src/main/cuda/headers/cum_sum_prod.cuh
@@ -17,11 +17,11 @@
  * under the License.
  */
 
-#ifndef __CUM_SUM_PROD_H
-#define __CUM_SUM_PROD_H
-
 #pragma once
+#ifndef CUM_SUM_PROD_H
+#define CUM_SUM_PROD_H
 
+using uint = unsigned int;
 #include <cuda_runtime.h>
 
 /**
@@ -145,4 +145,4 @@ extern "C" __global__ void cumulative_sum_prod_f(float *g_idata, float *g_odata,
 	cumulative_sum_prod<float, float2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
 }
 
-#endif // __CUM_SUM_PROD_H
+#endif // CUM_SUM_PROD_H
diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh
new file mode 100644
index 0000000..56845b5
--- /dev/null
+++ b/src/main/cuda/headers/reduction.cuh
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef REDUCTION_CUH
+#define REDUCTION_CUH
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
+
+#include "utils.cuh"
+
+/**
+ * Does a reduce operation over all elements of the array.
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA
+ * Samples (v8.0)
+ * and the Reduction example available through jcuda.org
+ * When invoked initially, all blocks partly compute the reduction operation
+ * over the entire array
+ * and writes it to the output/temporary array. A second invokation needs to
+ * happen to get the
+ * reduced value.
+ * The number of threads, blocks and amount of shared memory is calculated in a
+ * specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemDS code that invokes this
+ * method to see
+ * how its done.
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ *
+ * @param n		size of the input and temporary/output arrays		
+ * @param ReductionOp		Type of the functor object that implements the
+ *		reduction operation
+ * @param SpoofCellwiseOp		initial value for the reduction variable
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void FULL_AGG(
+		T *g_idata, ///< input data stored in device memory (of size n)
+		T *g_odata, ///< output/temporary array stored in device memory (of size n)
+		uint m,
+		uint n,
+		T initialValue, 
+		ReductionOp reduction_op, 
+	    SpoofCellwiseOp spoof_op)
+{
+	auto sdata = shared_memory_proxy<T>();
+
+	// perform first level of reduction,
+	// reading from global memory, writing to shared memory
+	uint tid = threadIdx.x;
+	uint i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+	uint gridSize = blockDim.x * 2 * gridDim.x;
+	uint N = m * n;
+	T v = initialValue;
+
+	// we reduce multiple elements per thread.  The number is determined by the
+	// number of active thread blocks (via gridDim).  More blocks will result
+	// in a larger gridSize and therefore fewer elements per thread
+	while (i < N) {
+		v = reduction_op(v, spoof_op(g_idata[i], i));
+
+		if (i + blockDim.x < N)	
+		{
+			//__syncthreads();
+			//printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
+			v = reduction_op(v, spoof_op(g_idata[i + blockDim.x], blockDim.x + i));
+		}
+
+		i += gridSize;
+	}
+
+	// each thread puts its local sum into shared memory
+	sdata[tid] = v;
+	__syncthreads();
+
+	// do reduction in shared mem
+	if (blockDim.x >= 1024) {
+		if (tid < 512) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 512) {
+		if (tid < 256) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 256) {
+		if (tid < 128) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 128) {
+		if (tid < 64) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+		}
+		__syncthreads();
+	}
+
+	if (tid < 32) {
+		// now that we are using warp-synchronous programming (below)
+		// we need to declare our shared memory volatile so that the compiler
+		// doesn't reorder stores to it and induce incorrect behavior.
+		volatile T *smem = sdata;
+		if (blockDim.x >= 64) {
+			smem[tid] = v = reduction_op(v, smem[tid + 32]);
+		}
+		if (blockDim.x >= 32) {
+			smem[tid] = v = reduction_op(v, smem[tid + 16]);
+		}
+		if (blockDim.x >= 16) {
+			smem[tid] = v = reduction_op(v, smem[tid + 8]);
+		}
+		if (blockDim.x >= 8) {
+			smem[tid] = v = reduction_op(v, smem[tid + 4]);
+		}
+		if (blockDim.x >= 4) {
+			smem[tid] = v = reduction_op(v, smem[tid + 2]);
+		}
+		if (blockDim.x >= 2) {
+			smem[tid] = v = reduction_op(v, smem[tid + 1]);
+		}
+	}
+
+	// write result for this block to global mem
+	if (tid == 0) {
+		if(gridDim.x < 10)
+			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
+		g_odata[blockIdx.x] = sdata[0];
+	}
+}
+
+/**
+ * Does a reduce (sum) over each row of the array.
+ * This kernel must be launched with as many blocks as there are rows.
+ * The intuition for this kernel is that each block does a reduction over a
+ * single row.
+ * The maximum number of blocks that can launched (as of compute capability 3.0)
+ * is 2^31 - 1
+ * This works out fine for SystemDS, since the maximum elements in a Java array
+ * can be 2^31 - c (some small constant)
+ * If the matrix is "fat" and "short", i.e. there are small number of rows and a
+ * large number of columns,
+ * there could be under-utilization of the hardware.
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * row
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void ROW_AGG(
+		T *g_idata, ///< input data stored in device memory (of size rows*cols)
+		T *g_odata,  ///< output/temporary array store in device memory (of size
+		/// rows*cols)
+		uint rows,  ///< rows in input and temporary/output arrays
+		uint cols,  ///< columns in input and temporary/output arrays
+		T initialValue,  ///< initial value for the reduction variable
+		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
+		SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
+{
+	auto sdata = shared_memory_proxy<T>();
+
+	// one block per row
+	if (blockIdx.x >= rows) {
+		return;
+	}
+
+	uint block = blockIdx.x;
+	uint tid = threadIdx.x;
+	uint i = tid;
+	uint block_offset = block * cols;
+
+	T v = initialValue;
+	while (i < cols) {
+		v = reduction_op(v, spoof_op(g_idata[block_offset + i], i));
+		i += blockDim.x;
+	}
+
+	// each thread puts its local sum into shared memory
+	sdata[tid] = v;
+	__syncthreads();
+
+	// do reduction in shared mem
+	if (blockDim.x >= 1024) {
+		if (tid < 512) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 512) {
+		if (tid < 256) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 256) {
+		if (tid < 128) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+		}
+		__syncthreads();
+	}
+	if (blockDim.x >= 128) {
+		if (tid < 64) {
+			sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+		}
+		__syncthreads();
+	}
+
+	if (tid < 32) {
+		// now that we are using warp-synchronous programming (below)
+		// we need to declare our shared memory volatile so that the compiler
+		// doesn't reorder stores to it and induce incorrect behavior.
+		volatile T *smem = sdata;
+		if (blockDim.x >= 64) {
+			smem[tid] = v = reduction_op(v, smem[tid + 32]);
+		}
+		if (blockDim.x >= 32) {
+			smem[tid] = v = reduction_op(v, smem[tid + 16]);
+		}
+		if (blockDim.x >= 16) {
+			smem[tid] = v = reduction_op(v, smem[tid + 8]);
+		}
+		if (blockDim.x >= 8) {
+			smem[tid] = v = reduction_op(v, smem[tid + 4]);
+		}
+		if (blockDim.x >= 4) {
+			smem[tid] = v = reduction_op(v, smem[tid + 2]);
+		}
+		if (blockDim.x >= 2) {
+			smem[tid] = v = reduction_op(v, smem[tid + 1]);
+		}
+	}
+
+	// write result for this block to global mem, modify it with assignment op
+	if (tid == 0)
+		g_odata[block] = sdata[0];
+}
+
+/**
+ * Does a column wise reduction.
+ * The intuition is that there are as many global threads as there are columns
+ * Each global thread is responsible for a single element in the output vector
+ * This of course leads to a under-utilization of the GPU resources.
+ * For cases, where the number of columns is small, there can be unused SMs
+ *
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * column
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
+		T *g_odata,  ///< output/temporary array store in device memory (of size rows*cols)
+		uint rows,  ///< rows in input and temporary/output arrays
+		uint cols,  ///< columns in input and temporary/output arrays
+		T initialValue,  ///< initial value for the reduction variable
+		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
+		SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
+		
+{
+	uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
+	if (global_tid >= cols) {
+		return;
+	}
+
+	uint i = global_tid;
+	uint grid_size = cols;
+	T val = initialValue;
+
+	while (i < rows * cols) {
+		val = reduction_op(val, spoof_op(g_idata[i], i));
+		i += grid_size;
+	}
+	g_odata[global_tid] = val;
+}
+
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void NO_AGG(T* g_idata, T* g_odata,  uint rows, uint cols,
+	T VT,  ReductionOp reduction_op, SpoofCellwiseOp spoof_op) 
+{
+	int tid = blockIdx.x * blockDim.x + threadIdx.x;
+	int first_idx = tid * static_cast<int>(VT);
+	int last_idx = min(first_idx + static_cast<int>(VT), spoof_op.m * spoof_op.n);
+	#pragma unroll
+	for(int i = first_idx; i < last_idx; i++) {
+		g_odata[i] = spoof_op(g_idata[i], i);
+	}
+}
+
+#endif // REDUCTION_CUH
diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh
new file mode 100644
index 0000000..e28d254
--- /dev/null
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef SPOOF_UTILS_CUH
+#define SPOOF_UTILS_CUH
+
+#include <math_constants.h>
+
+__constant__ double DOUBLE_EPS = 1.11022E-16; // 2 ^ -53
+__constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26
+__constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it
+
+__device__ long long toInt64(double a) {
+    return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS));
+}
+
+__device__ int toInt32(float a) {
+    return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS));
+}
+
+template<typename T>
+__device__ T getValue(T* data, int rowIndex) {
+    return data[rowIndex];
+}
+
+template<typename T>
+__device__ T getValue(T* data, int n, int rowIndex, int colIndex) {
+    return data[rowIndex * n + colIndex];
+}
+
+template<typename T>
+__device__ T intDiv(T a, T b);
+
+template<>
+__device__ double intDiv(double a, double b) {
+    double ret = a / b;
+    return (isnan(ret) || isinf(ret)) ? ret : toInt64(ret);
+}
+
+template<>
+__device__ float intDiv(float a, float b) {
+    float ret = a / b;
+    return (isnan(ret) || isinf(ret)) ? ret : toInt32(ret);
+}
+
+template<typename T>
+__device__ T modulus(T a, T b);
+
+template<>
+__device__ double modulus(double a, double b) {
+    if (fabs(b) < DOUBLE_EPS)
+        return CUDART_NAN;
+    return a - intDiv(a, b) * b;
+}
+
+template<>
+__device__ float modulus(float a, float b) {
+    if (fabs(b) < FLOAT_EPS)
+        return CUDART_NAN_F;
+    return a - intDiv(a, b) * b;
+}
+
+template<typename T>
+__device__ T bwAnd(T a, T b);
+
+// ToDo: does not work with long long
+template<>
+__device__ double bwAnd(double a, double b) {
+	return toInt64(a) & toInt64(b);
+}
+
+template<>
+__device__ float bwAnd(float a, float b) {
+	return toInt32(a) & toInt32(b);
+}
+
+#endif // SPOOF_UTILS_CUH
diff --git a/src/main/cpp/kernels/utils.cuh b/src/main/cuda/headers/utils.cuh
similarity index 87%
rename from src/main/cpp/kernels/utils.cuh
rename to src/main/cuda/headers/utils.cuh
index e4ec01d..420c0d0 100644
--- a/src/main/cpp/kernels/utils.cuh
+++ b/src/main/cuda/headers/utils.cuh
@@ -17,12 +17,25 @@
  * under the License.
  */
 
-#ifndef __UTILS_H
-#define __UTILS_H
-
 #pragma once
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <limits>
 
-#include <cuda_runtime.h>
+// Use this method in templates to fetch the maximum value for a given datatype
+template<typename T>
+__forceinline__ __device__ T MAX() {
+	return T();
+}
+template<>
+__forceinline__ __device__ float MAX<float>() {
+	return std::numeric_limits<float>::max();
+}
+template<>
+__forceinline__ __device__ double MAX<double>() {
+	return std::numeric_limits<double>::max();
+}
 
 /**
  * Solution suggested by [1] to have different types of shared memory
@@ -109,4 +122,4 @@ extern "C" __global__ void float2double_f(float *A, double *ret, int N) {
 	}
 }
 
-#endif // __UTILS_H
+#endif // UTILS_H
diff --git a/src/main/cpp/kernels/SystemDS.cu b/src/main/cuda/kernels/SystemDS.cu
similarity index 99%
rename from src/main/cpp/kernels/SystemDS.cu
rename to src/main/cuda/kernels/SystemDS.cu
index ccf880b..52e2b33 100644
--- a/src/main/cpp/kernels/SystemDS.cu
+++ b/src/main/cuda/kernels/SystemDS.cu
@@ -23,11 +23,8 @@
  nvcc -w -ptx -arch=sm_30 --std c++11 SystemDS.cu
  ***********************************/
 
-#include <cfloat>
-#include <cmath>
 using uint = unsigned int;
 #include <cuda_runtime.h>
-#include <device_launch_parameters.h>
 
 #include "utils.cuh"
 #include "agg_ops.cuh"
@@ -424,20 +421,6 @@ extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) {
 	copy_u2l_dense(ret, dim, N);
 }
 
-// Use this method in templates to fetch the maximum value for a given datatype
-template<typename T>
-__forceinline__ __device__ T MAX() {
-	return T();
-}
-template<>
-__forceinline__ __device__ float MAX<float>() {
-	return FLT_MAX;
-}
-template<>
-__forceinline__ __device__ double MAX<double>() {
-	return DBL_MAX;
-}
-
 // op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
 // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz,
diff --git a/src/main/cpp/kernels/SystemDS.ptx b/src/main/cuda/kernels/SystemDS.ptx
similarity index 100%
rename from src/main/cpp/kernels/SystemDS.ptx
rename to src/main/cuda/kernels/SystemDS.ptx
diff --git a/src/main/cuda/kernels/reduction.cu b/src/main/cuda/kernels/reduction.cu
new file mode 100644
index 0000000..04fd098
--- /dev/null
+++ b/src/main/cuda/kernels/reduction.cu
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "utils.cuh"
+#include "agg_ops.cuh"
+#include "reduction.cuh"
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
+
+/**
+ * Do a summation over all elements of an array/matrix
+ * @param g_idata   input data stored in device memory (of size n)
+ * @param g_odata   output/temporary array stored in device memory (of size n)
+ * @param n         size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_sum(T *g_idata, T *g_odata, uint n) {
+	SumOp<T> agg_op;	
+	IdentityOp<T> spoof_op;
+	FULL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, uint n) {
+	reduce_sum(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) {
+	reduce_sum(g_idata, g_odata, n);
+}
+
+/**
+ * Do a summation over all rows of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size rows)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
+	SumOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a summation over all columns of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size cols)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
+	SumOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	COL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_col_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_col_sum(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a max over all elements of an array/matrix
+ * @param g_idata   input data stored in device memory (of size n)
+ * @param g_odata   output/temporary array stored in device memory (of size n)
+ * @param n         size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_max(T *g_idata, T *g_odata, uint n) {
+	MaxOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	FULL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, uint n) {
+	reduce_max(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) {
+	reduce_max(g_idata, g_odata, n);
+}
+
+/**
+ * Do a max over all rows of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size rows)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) {
+	MaxOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	ROW_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a max over all columns of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size cols)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) {
+	MaxOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	COL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a min over all elements of an array/matrix
+ * @param g_idata   input data stored in device memory (of size n)
+ * @param g_odata   output/temporary array stored in device memory (of size n)
+ * @param n         size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_min(T *g_idata, T *g_odata, uint n) {
+	MinOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	FULL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, uint n) {
+	reduce_min(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) {
+	reduce_min(g_idata, g_odata, n);
+}
+
+
+/**
+ * Do a min over all rows of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size rows)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) {
+	MinOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	ROW_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a min over all columns of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size cols)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) {
+	MinOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	COL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+	reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+	reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a summation over all squared elements of an array/matrix
+ * @param g_idata   input data stored in device memory (of size n)
+ * @param g_odata   output/temporary array stored in device memory (of size n)
+ * @param n         size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) {
+	SumSqOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	FULL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) {
+	reduce_sum_sq(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) {
+	reduce_sum_sq(g_idata, g_odata, n);
+}
+
+/**
+ * Do a column-wise summation over the squared elements of a matrix
+ * @param g_idata   input matrix stored in device memory (of size rows * cols)
+ * @param g_odata   output vector stored in device memory (of size cols)
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
+	SumSqOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	COL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
+	reduce_col_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
+	reduce_col_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+template<typename T>
+__device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
+	SumSqOp<T> agg_op;
+	IdentityOp<T> spoof_op;
+	ROW_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
+	reduce_row_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
+	reduce_row_sum_sq(g_idata, g_odata, rows, cols);
+}
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
new file mode 100644
index 0000000..4a30447
--- /dev/null
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -0,0 +1,3546 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-27506705
+// Cuda compilation tools, release 10.2, V10.2.89
+// Based on LLVM 3.4svn
+//
+
+.version 6.5
+.target sm_30
+.address_size 64
+
+	// .globl	double2float_f
+.extern .func  (.param .b32 func_retval0) vprintf
+(
+	.param .b64 vprintf_param_0,
+	.param .b64 vprintf_param_1
+)
+;
+.global .align 1 .b8 $str[39] = {98, 108, 111, 99, 107, 73, 100, 120, 46, 120, 61, 37, 100, 32, 114, 101, 100, 117, 99, 116, 105, 111, 110, 32, 114, 101, 115, 117, 108, 116, 58, 32, 37, 51, 46, 49, 102, 10, 0};
+.extern .shared .align 1 .b8 memory[];
+
+.visible .entry double2float_f(
+	.param .u64 double2float_f_param_0,
+	.param .u64 double2float_f_param_1,
+	.param .u32 double2float_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [double2float_f_param_0];
+	ld.param.u64 	%rd2, [double2float_f_param_1];
+	ld.param.u32 	%r2, [double2float_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rn.f32.f64	%f1, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB0_2:
+	ret;
+}
+
+	// .globl	float2double_f
+.visible .entry float2double_f(
+	.param .u64 float2double_f_param_0,
+	.param .u64 float2double_f_param_1,
+	.param .u32 float2double_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [float2double_f_param_0];
+	ld.param.u64 	%rd2, [float2double_f_param_1];
+	ld.param.u32 	%r2, [float2double_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB1_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd1;
+
+BB1_2:
+	ret;
+}
+
+	// .globl	reduce_sum_d
+.visible .entry reduce_sum_d(
+	.param .u64 reduce_sum_d_param_0,
+	.param .u64 reduce_sum_d_param_1,
+	.param .u32 reduce_sum_d_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot2[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<61>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot2;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_sum_d_param_0];
+	ld.param.u64 	%rd2, [reduce_sum_d_param_1];
+	ld.param.u32 	%r6, [reduce_sum_d_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f64 	%fd45, 0d0000000000000000;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB2_4;
+
+BB2_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd30, [%rd5];
+	add.f64 	%fd45, %fd45, %fd30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB2_3;
+
+	mul.wide.u32 	%rd7, %r3, 8;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f64 	%fd31, [%rd8];
+	add.f64 	%fd45, %fd45, %fd31;
+
+BB2_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB2_1;
+
+BB2_4:
+	shl.b32 	%r16, %r10, 3;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f64 	[%r5], %fd45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB2_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB2_7;
+
+	ld.shared.f64 	%fd32, [%r5+4096];
+	add.f64 	%fd45, %fd45, %fd32;
+	st.shared.f64 	[%r5], %fd45;
+
+BB2_7:
+	bar.sync 	0;
+
+BB2_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB2_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB2_11;
+
+	ld.shared.f64 	%fd33, [%r5+2048];
+	add.f64 	%fd45, %fd45, %fd33;
+	st.shared.f64 	[%r5], %fd45;
+
+BB2_11:
+	bar.sync 	0;
+
+BB2_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB2_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB2_15;
+
+	ld.shared.f64 	%fd34, [%r5+1024];
+	add.f64 	%fd45, %fd45, %fd34;
+	st.shared.f64 	[%r5], %fd45;
+
+BB2_15:
+	bar.sync 	0;
+
+BB2_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB2_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB2_19;
+
+	ld.shared.f64 	%fd35, [%r5+512];
+	add.f64 	%fd45, %fd45, %fd35;
+	st.shared.f64 	[%r5], %fd45;
+
+BB2_19:
+	bar.sync 	0;
+
+BB2_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB2_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB2_23;
+
+	ld.volatile.shared.f64 	%fd36, [%r5+256];
+	add.f64 	%fd45, %fd45, %fd36;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB2_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB2_25;
+
+	ld.volatile.shared.f64 	%fd37, [%r5+128];
+	add.f64 	%fd45, %fd45, %fd37;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB2_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB2_27;
+
+	ld.volatile.shared.f64 	%fd38, [%r5+64];
+	add.f64 	%fd45, %fd45, %fd38;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB2_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB2_29;
+
+	ld.volatile.shared.f64 	%fd39, [%r5+32];
+	add.f64 	%fd45, %fd45, %fd39;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB2_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB2_31;
+
+	ld.volatile.shared.f64 	%fd40, [%r5+16];
+	add.f64 	%fd45, %fd45, %fd40;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB2_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB2_33;
+
+	ld.volatile.shared.f64 	%fd41, [%r5+8];
+	add.f64 	%fd42, %fd45, %fd41;
+	st.volatile.shared.f64 	[%r5], %fd42;
+
+BB2_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB2_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB2_36;
+
+	ld.shared.f64 	%fd43, [memory];
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd43;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 0
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 0
+
+BB2_36:
+	ld.shared.f64 	%fd44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 8;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd44;
+
+BB2_37:
+	ret;
+}
+
+	// .globl	reduce_sum_f
+.visible .entry reduce_sum_f(
+	.param .u64 reduce_sum_f_param_0,
+	.param .u64 reduce_sum_f_param_1,
+	.param .u32 reduce_sum_f_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot3[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .f32 	%f<61>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot3;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_sum_f_param_0];
+	ld.param.u64 	%rd2, [reduce_sum_f_param_1];
+	ld.param.u32 	%r6, [reduce_sum_f_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f32 	%f45, 0f00000000;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB3_4;
+
+BB3_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f30, [%rd5];
+	add.f32 	%f45, %f45, %f30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB3_3;
+
+	mul.wide.u32 	%rd7, %r3, 4;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f32 	%f31, [%rd8];
+	add.f32 	%f45, %f45, %f31;
+
+BB3_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB3_1;
+
+BB3_4:
+	shl.b32 	%r16, %r10, 2;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f32 	[%r5], %f45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB3_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB3_7;
+
+	ld.shared.f32 	%f32, [%r5+2048];
+	add.f32 	%f45, %f45, %f32;
+	st.shared.f32 	[%r5], %f45;
+
+BB3_7:
+	bar.sync 	0;
+
+BB3_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB3_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB3_11;
+
+	ld.shared.f32 	%f33, [%r5+1024];
+	add.f32 	%f45, %f45, %f33;
+	st.shared.f32 	[%r5], %f45;
+
+BB3_11:
+	bar.sync 	0;
+
+BB3_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB3_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB3_15;
+
+	ld.shared.f32 	%f34, [%r5+512];
+	add.f32 	%f45, %f45, %f34;
+	st.shared.f32 	[%r5], %f45;
+
+BB3_15:
+	bar.sync 	0;
+
+BB3_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB3_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB3_19;
+
+	ld.shared.f32 	%f35, [%r5+256];
+	add.f32 	%f45, %f45, %f35;
+	st.shared.f32 	[%r5], %f45;
+
+BB3_19:
+	bar.sync 	0;
+
+BB3_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB3_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB3_23;
+
+	ld.volatile.shared.f32 	%f36, [%r5+128];
+	add.f32 	%f45, %f45, %f36;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB3_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB3_25;
+
+	ld.volatile.shared.f32 	%f37, [%r5+64];
+	add.f32 	%f45, %f45, %f37;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB3_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB3_27;
+
+	ld.volatile.shared.f32 	%f38, [%r5+32];
+	add.f32 	%f45, %f45, %f38;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB3_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB3_29;
+
+	ld.volatile.shared.f32 	%f39, [%r5+16];
+	add.f32 	%f45, %f45, %f39;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB3_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB3_31;
+
+	ld.volatile.shared.f32 	%f40, [%r5+8];
+	add.f32 	%f45, %f45, %f40;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB3_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB3_33;
+
+	ld.volatile.shared.f32 	%f41, [%r5+4];
+	add.f32 	%f42, %f45, %f41;
+	st.volatile.shared.f32 	[%r5], %f42;
+
+BB3_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB3_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB3_36;
+
+	ld.shared.f32 	%f43, [memory];
+	cvt.f64.f32	%fd1, %f43;
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd1;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 1
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 1
+
+BB3_36:
+	ld.shared.f32 	%f44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f32 	[%rd15], %f44;
+
+BB3_37:
+	ret;
+}
+
+	// .globl	reduce_row_sum_d
+.visible .entry reduce_row_sum_d(
+	.param .u64 reduce_row_sum_d_param_0,
+	.param .u64 reduce_row_sum_d_param_1,
+	.param .u32 reduce_row_sum_d_param_2,
+	.param .u32 reduce_row_sum_d_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .b32 	%r<72>;
+	.reg .f64 	%fd<56>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_sum_d_param_0];
+	ld.param.u64 	%rd2, [reduce_row_sum_d_param_1];
+	ld.param.u32 	%r5, [reduce_row_sum_d_param_2];
+	ld.param.u32 	%r4, [reduce_row_sum_d_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB4_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f64 	%fd6, 0d0000000000000000;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB4_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB4_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd28, [%rd5];
+	add.f64 	%fd6, %fd6, %fd28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB4_3;
+
+BB4_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 3;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f64 	[%r13], %fd6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB4_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB4_7;
+
+	ld.shared.f64 	%fd29, [%r13+4096];
+	add.f64 	%fd6, %fd6, %fd29;
+	st.shared.f64 	[%r13], %fd6;
+
+BB4_7:
+	bar.sync 	0;
+
+BB4_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB4_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB4_11;
+
+	ld.shared.f64 	%fd30, [%r13+2048];
+	add.f64 	%fd6, %fd6, %fd30;
+	st.shared.f64 	[%r13], %fd6;
+
+BB4_11:
+	bar.sync 	0;
+
+BB4_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB4_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB4_15;
+
+	ld.shared.f64 	%fd31, [%r13+1024];
+	add.f64 	%fd6, %fd6, %fd31;
+	st.shared.f64 	[%r13], %fd6;
+
+BB4_15:
+	bar.sync 	0;
+
+BB4_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB4_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB4_19;
+
+	ld.shared.f64 	%fd32, [%r13+512];
+	add.f64 	%fd6, %fd6, %fd32;
+	st.shared.f64 	[%r13], %fd6;
+
+BB4_19:
+	bar.sync 	0;
+
+BB4_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB4_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB4_23;
+
+	ld.volatile.shared.f64 	%fd33, [%r13+256];
+	add.f64 	%fd6, %fd6, %fd33;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB4_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB4_25;
+
+	ld.volatile.shared.f64 	%fd34, [%r13+128];
+	add.f64 	%fd6, %fd6, %fd34;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB4_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB4_27;
+
+	ld.volatile.shared.f64 	%fd35, [%r13+64];
+	add.f64 	%fd6, %fd6, %fd35;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB4_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB4_29;
+
+	ld.volatile.shared.f64 	%fd36, [%r13+32];
+	add.f64 	%fd6, %fd6, %fd36;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB4_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB4_31;
+
+	ld.volatile.shared.f64 	%fd37, [%r13+16];
+	add.f64 	%fd6, %fd6, %fd37;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB4_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB4_33;
+
+	ld.volatile.shared.f64 	%fd38, [%r13+8];
+	add.f64 	%fd39, %fd6, %fd38;
+	st.volatile.shared.f64 	[%r13], %fd39;
+
+BB4_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB4_35;
+
+	ld.shared.f64 	%fd40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd40;
+
+BB4_35:
+	ret;
+}
+
+	// .globl	reduce_row_sum_f
+.visible .entry reduce_row_sum_f(
+	.param .u64 reduce_row_sum_f_param_0,
+	.param .u64 reduce_row_sum_f_param_1,
+	.param .u32 reduce_row_sum_f_param_2,
+	.param .u32 reduce_row_sum_f_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .f32 	%f<56>;
+	.reg .b32 	%r<72>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_sum_f_param_0];
+	ld.param.u64 	%rd2, [reduce_row_sum_f_param_1];
+	ld.param.u32 	%r5, [reduce_row_sum_f_param_2];
+	ld.param.u32 	%r4, [reduce_row_sum_f_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB5_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f32 	%f6, 0f00000000;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB5_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB5_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f28, [%rd5];
+	add.f32 	%f6, %f6, %f28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB5_3;
+
+BB5_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 2;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f32 	[%r13], %f6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB5_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB5_7;
+
+	ld.shared.f32 	%f29, [%r13+2048];
+	add.f32 	%f6, %f6, %f29;
+	st.shared.f32 	[%r13], %f6;
+
+BB5_7:
+	bar.sync 	0;
+
+BB5_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB5_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB5_11;
+
+	ld.shared.f32 	%f30, [%r13+1024];
+	add.f32 	%f6, %f6, %f30;
+	st.shared.f32 	[%r13], %f6;
+
+BB5_11:
+	bar.sync 	0;
+
+BB5_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB5_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB5_15;
+
+	ld.shared.f32 	%f31, [%r13+512];
+	add.f32 	%f6, %f6, %f31;
+	st.shared.f32 	[%r13], %f6;
+
+BB5_15:
+	bar.sync 	0;
+
+BB5_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB5_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB5_19;
+
+	ld.shared.f32 	%f32, [%r13+256];
+	add.f32 	%f6, %f6, %f32;
+	st.shared.f32 	[%r13], %f6;
+
+BB5_19:
+	bar.sync 	0;
+
+BB5_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB5_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB5_23;
+
+	ld.volatile.shared.f32 	%f33, [%r13+128];
+	add.f32 	%f6, %f6, %f33;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB5_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB5_25;
+
+	ld.volatile.shared.f32 	%f34, [%r13+64];
+	add.f32 	%f6, %f6, %f34;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB5_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB5_27;
+
+	ld.volatile.shared.f32 	%f35, [%r13+32];
+	add.f32 	%f6, %f6, %f35;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB5_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB5_29;
+
+	ld.volatile.shared.f32 	%f36, [%r13+16];
+	add.f32 	%f6, %f6, %f36;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB5_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB5_31;
+
+	ld.volatile.shared.f32 	%f37, [%r13+8];
+	add.f32 	%f6, %f6, %f37;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB5_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB5_33;
+
+	ld.volatile.shared.f32 	%f38, [%r13+4];
+	add.f32 	%f39, %f6, %f38;
+	st.volatile.shared.f32 	[%r13], %f39;
+
+BB5_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB5_35;
+
+	ld.shared.f32 	%f40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f40;
+
+BB5_35:
+	ret;
+}
+
+	// .globl	reduce_col_sum_d
+.visible .entry reduce_col_sum_d(
+	.param .u64 reduce_col_sum_d_param_0,
+	.param .u64 reduce_col_sum_d_param_1,
+	.param .u32 reduce_col_sum_d_param_2,
+	.param .u32 reduce_col_sum_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<9>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_sum_d_param_0];
+	ld.param.u64 	%rd3, [reduce_col_sum_d_param_1];
+	ld.param.u32 	%r5, [reduce_col_sum_d_param_2];
+	ld.param.u32 	%r6, [reduce_col_sum_d_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB6_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f64 	%fd8, 0d0000000000000000;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB6_4;
+
+	mov.u32 	%r10, %r1;
+
+BB6_3:
+	mul.wide.u32 	%rd4, %r10, 8;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f64 	%fd6, [%rd5];
+	add.f64 	%fd8, %fd8, %fd6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB6_3;
+
+BB6_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd8;
+
+BB6_5:
+	ret;
+}
+
+	// .globl	reduce_col_sum_f
+.visible .entry reduce_col_sum_f(
+	.param .u64 reduce_col_sum_f_param_0,
+	.param .u64 reduce_col_sum_f_param_1,
+	.param .u32 reduce_col_sum_f_param_2,
+	.param .u32 reduce_col_sum_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<9>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_sum_f_param_0];
+	ld.param.u64 	%rd3, [reduce_col_sum_f_param_1];
+	ld.param.u32 	%r5, [reduce_col_sum_f_param_2];
+	ld.param.u32 	%r6, [reduce_col_sum_f_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB7_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f32 	%f8, 0f00000000;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB7_4;
+
+	mov.u32 	%r10, %r1;
+
+BB7_3:
+	mul.wide.u32 	%rd4, %r10, 4;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f32 	%f6, [%rd5];
+	add.f32 	%f8, %f8, %f6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB7_3;
+
+BB7_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f8;
+
+BB7_5:
+	ret;
+}
+
+	// .globl	reduce_max_d
+.visible .entry reduce_max_d(
+	.param .u64 reduce_max_d_param_0,
+	.param .u64 reduce_max_d_param_1,
+	.param .u32 reduce_max_d_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot8[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<61>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot8;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_max_d_param_0];
+	ld.param.u64 	%rd2, [reduce_max_d_param_1];
+	ld.param.u32 	%r6, [reduce_max_d_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f64 	%fd45, 0dFFEFFFFFFFFFFFFF;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB8_4;
+
+BB8_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd30, [%rd5];
+	max.f64 	%fd45, %fd45, %fd30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB8_3;
+
+	mul.wide.u32 	%rd7, %r3, 8;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f64 	%fd31, [%rd8];
+	max.f64 	%fd45, %fd45, %fd31;
+
+BB8_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB8_1;
+
+BB8_4:
+	shl.b32 	%r16, %r10, 3;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f64 	[%r5], %fd45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB8_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB8_7;
+
+	ld.shared.f64 	%fd32, [%r5+4096];
+	max.f64 	%fd45, %fd45, %fd32;
+	st.shared.f64 	[%r5], %fd45;
+
+BB8_7:
+	bar.sync 	0;
+
+BB8_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB8_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB8_11;
+
+	ld.shared.f64 	%fd33, [%r5+2048];
+	max.f64 	%fd45, %fd45, %fd33;
+	st.shared.f64 	[%r5], %fd45;
+
+BB8_11:
+	bar.sync 	0;
+
+BB8_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB8_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB8_15;
+
+	ld.shared.f64 	%fd34, [%r5+1024];
+	max.f64 	%fd45, %fd45, %fd34;
+	st.shared.f64 	[%r5], %fd45;
+
+BB8_15:
+	bar.sync 	0;
+
+BB8_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB8_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB8_19;
+
+	ld.shared.f64 	%fd35, [%r5+512];
+	max.f64 	%fd45, %fd45, %fd35;
+	st.shared.f64 	[%r5], %fd45;
+
+BB8_19:
+	bar.sync 	0;
+
+BB8_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB8_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB8_23;
+
+	ld.volatile.shared.f64 	%fd36, [%r5+256];
+	max.f64 	%fd45, %fd45, %fd36;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB8_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB8_25;
+
+	ld.volatile.shared.f64 	%fd37, [%r5+128];
+	max.f64 	%fd45, %fd45, %fd37;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB8_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB8_27;
+
+	ld.volatile.shared.f64 	%fd38, [%r5+64];
+	max.f64 	%fd45, %fd45, %fd38;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB8_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB8_29;
+
+	ld.volatile.shared.f64 	%fd39, [%r5+32];
+	max.f64 	%fd45, %fd45, %fd39;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB8_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB8_31;
+
+	ld.volatile.shared.f64 	%fd40, [%r5+16];
+	max.f64 	%fd45, %fd45, %fd40;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB8_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB8_33;
+
+	ld.volatile.shared.f64 	%fd41, [%r5+8];
+	max.f64 	%fd42, %fd45, %fd41;
+	st.volatile.shared.f64 	[%r5], %fd42;
+
+BB8_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB8_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB8_36;
+
+	ld.shared.f64 	%fd43, [memory];
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd43;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 2
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 2
+
+BB8_36:
+	ld.shared.f64 	%fd44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 8;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd44;
+
+BB8_37:
+	ret;
+}
+
+	// .globl	reduce_max_f
+.visible .entry reduce_max_f(
+	.param .u64 reduce_max_f_param_0,
+	.param .u64 reduce_max_f_param_1,
+	.param .u32 reduce_max_f_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot9[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .f32 	%f<61>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot9;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_max_f_param_0];
+	ld.param.u64 	%rd2, [reduce_max_f_param_1];
+	ld.param.u32 	%r6, [reduce_max_f_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f32 	%f45, 0fFF7FFFFF;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB9_4;
+
+BB9_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f30, [%rd5];
+	max.f32 	%f45, %f45, %f30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB9_3;
+
+	mul.wide.u32 	%rd7, %r3, 4;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f32 	%f31, [%rd8];
+	max.f32 	%f45, %f45, %f31;
+
+BB9_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB9_1;
+
+BB9_4:
+	shl.b32 	%r16, %r10, 2;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f32 	[%r5], %f45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB9_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB9_7;
+
+	ld.shared.f32 	%f32, [%r5+2048];
+	max.f32 	%f45, %f45, %f32;
+	st.shared.f32 	[%r5], %f45;
+
+BB9_7:
+	bar.sync 	0;
+
+BB9_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB9_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB9_11;
+
+	ld.shared.f32 	%f33, [%r5+1024];
+	max.f32 	%f45, %f45, %f33;
+	st.shared.f32 	[%r5], %f45;
+
+BB9_11:
+	bar.sync 	0;
+
+BB9_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB9_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB9_15;
+
+	ld.shared.f32 	%f34, [%r5+512];
+	max.f32 	%f45, %f45, %f34;
+	st.shared.f32 	[%r5], %f45;
+
+BB9_15:
+	bar.sync 	0;
+
+BB9_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB9_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB9_19;
+
+	ld.shared.f32 	%f35, [%r5+256];
+	max.f32 	%f45, %f45, %f35;
+	st.shared.f32 	[%r5], %f45;
+
+BB9_19:
+	bar.sync 	0;
+
+BB9_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB9_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB9_23;
+
+	ld.volatile.shared.f32 	%f36, [%r5+128];
+	max.f32 	%f45, %f45, %f36;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB9_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB9_25;
+
+	ld.volatile.shared.f32 	%f37, [%r5+64];
+	max.f32 	%f45, %f45, %f37;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB9_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB9_27;
+
+	ld.volatile.shared.f32 	%f38, [%r5+32];
+	max.f32 	%f45, %f45, %f38;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB9_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB9_29;
+
+	ld.volatile.shared.f32 	%f39, [%r5+16];
+	max.f32 	%f45, %f45, %f39;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB9_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB9_31;
+
+	ld.volatile.shared.f32 	%f40, [%r5+8];
+	max.f32 	%f45, %f45, %f40;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB9_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB9_33;
+
+	ld.volatile.shared.f32 	%f41, [%r5+4];
+	max.f32 	%f42, %f45, %f41;
+	st.volatile.shared.f32 	[%r5], %f42;
+
+BB9_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB9_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB9_36;
+
+	ld.shared.f32 	%f43, [memory];
+	cvt.f64.f32	%fd1, %f43;
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd1;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 3
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 3
+
+BB9_36:
+	ld.shared.f32 	%f44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f32 	[%rd15], %f44;
+
+BB9_37:
+	ret;
+}
+
+	// .globl	reduce_row_max_d
+.visible .entry reduce_row_max_d(
+	.param .u64 reduce_row_max_d_param_0,
+	.param .u64 reduce_row_max_d_param_1,
+	.param .u32 reduce_row_max_d_param_2,
+	.param .u32 reduce_row_max_d_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .b32 	%r<72>;
+	.reg .f64 	%fd<56>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_max_d_param_0];
+	ld.param.u64 	%rd2, [reduce_row_max_d_param_1];
+	ld.param.u32 	%r5, [reduce_row_max_d_param_2];
+	ld.param.u32 	%r4, [reduce_row_max_d_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB10_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f64 	%fd6, 0dFFEFFFFFFFFFFFFF;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB10_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB10_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd28, [%rd5];
+	max.f64 	%fd6, %fd6, %fd28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB10_3;
+
+BB10_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 3;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f64 	[%r13], %fd6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB10_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB10_7;
+
+	ld.shared.f64 	%fd29, [%r13+4096];
+	max.f64 	%fd6, %fd6, %fd29;
+	st.shared.f64 	[%r13], %fd6;
+
+BB10_7:
+	bar.sync 	0;
+
+BB10_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB10_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB10_11;
+
+	ld.shared.f64 	%fd30, [%r13+2048];
+	max.f64 	%fd6, %fd6, %fd30;
+	st.shared.f64 	[%r13], %fd6;
+
+BB10_11:
+	bar.sync 	0;
+
+BB10_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB10_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB10_15;
+
+	ld.shared.f64 	%fd31, [%r13+1024];
+	max.f64 	%fd6, %fd6, %fd31;
+	st.shared.f64 	[%r13], %fd6;
+
+BB10_15:
+	bar.sync 	0;
+
+BB10_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB10_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB10_19;
+
+	ld.shared.f64 	%fd32, [%r13+512];
+	max.f64 	%fd6, %fd6, %fd32;
+	st.shared.f64 	[%r13], %fd6;
+
+BB10_19:
+	bar.sync 	0;
+
+BB10_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB10_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB10_23;
+
+	ld.volatile.shared.f64 	%fd33, [%r13+256];
+	max.f64 	%fd6, %fd6, %fd33;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB10_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB10_25;
+
+	ld.volatile.shared.f64 	%fd34, [%r13+128];
+	max.f64 	%fd6, %fd6, %fd34;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB10_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB10_27;
+
+	ld.volatile.shared.f64 	%fd35, [%r13+64];
+	max.f64 	%fd6, %fd6, %fd35;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB10_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB10_29;
+
+	ld.volatile.shared.f64 	%fd36, [%r13+32];
+	max.f64 	%fd6, %fd6, %fd36;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB10_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB10_31;
+
+	ld.volatile.shared.f64 	%fd37, [%r13+16];
+	max.f64 	%fd6, %fd6, %fd37;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB10_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB10_33;
+
+	ld.volatile.shared.f64 	%fd38, [%r13+8];
+	max.f64 	%fd39, %fd6, %fd38;
+	st.volatile.shared.f64 	[%r13], %fd39;
+
+BB10_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB10_35;
+
+	ld.shared.f64 	%fd40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd40;
+
+BB10_35:
+	ret;
+}
+
+	// .globl	reduce_row_max_f
+.visible .entry reduce_row_max_f(
+	.param .u64 reduce_row_max_f_param_0,
+	.param .u64 reduce_row_max_f_param_1,
+	.param .u32 reduce_row_max_f_param_2,
+	.param .u32 reduce_row_max_f_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .f32 	%f<56>;
+	.reg .b32 	%r<72>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_max_f_param_0];
+	ld.param.u64 	%rd2, [reduce_row_max_f_param_1];
+	ld.param.u32 	%r5, [reduce_row_max_f_param_2];
+	ld.param.u32 	%r4, [reduce_row_max_f_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB11_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f32 	%f6, 0fFF7FFFFF;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB11_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB11_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f28, [%rd5];
+	max.f32 	%f6, %f6, %f28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB11_3;
+
+BB11_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 2;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f32 	[%r13], %f6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB11_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB11_7;
+
+	ld.shared.f32 	%f29, [%r13+2048];
+	max.f32 	%f6, %f6, %f29;
+	st.shared.f32 	[%r13], %f6;
+
+BB11_7:
+	bar.sync 	0;
+
+BB11_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB11_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB11_11;
+
+	ld.shared.f32 	%f30, [%r13+1024];
+	max.f32 	%f6, %f6, %f30;
+	st.shared.f32 	[%r13], %f6;
+
+BB11_11:
+	bar.sync 	0;
+
+BB11_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB11_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB11_15;
+
+	ld.shared.f32 	%f31, [%r13+512];
+	max.f32 	%f6, %f6, %f31;
+	st.shared.f32 	[%r13], %f6;
+
+BB11_15:
+	bar.sync 	0;
+
+BB11_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB11_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB11_19;
+
+	ld.shared.f32 	%f32, [%r13+256];
+	max.f32 	%f6, %f6, %f32;
+	st.shared.f32 	[%r13], %f6;
+
+BB11_19:
+	bar.sync 	0;
+
+BB11_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB11_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB11_23;
+
+	ld.volatile.shared.f32 	%f33, [%r13+128];
+	max.f32 	%f6, %f6, %f33;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB11_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB11_25;
+
+	ld.volatile.shared.f32 	%f34, [%r13+64];
+	max.f32 	%f6, %f6, %f34;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB11_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB11_27;
+
+	ld.volatile.shared.f32 	%f35, [%r13+32];
+	max.f32 	%f6, %f6, %f35;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB11_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB11_29;
+
+	ld.volatile.shared.f32 	%f36, [%r13+16];
+	max.f32 	%f6, %f6, %f36;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB11_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB11_31;
+
+	ld.volatile.shared.f32 	%f37, [%r13+8];
+	max.f32 	%f6, %f6, %f37;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB11_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB11_33;
+
+	ld.volatile.shared.f32 	%f38, [%r13+4];
+	max.f32 	%f39, %f6, %f38;
+	st.volatile.shared.f32 	[%r13], %f39;
+
+BB11_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB11_35;
+
+	ld.shared.f32 	%f40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f40;
+
+BB11_35:
+	ret;
+}
+
+	// .globl	reduce_col_max_d
+.visible .entry reduce_col_max_d(
+	.param .u64 reduce_col_max_d_param_0,
+	.param .u64 reduce_col_max_d_param_1,
+	.param .u32 reduce_col_max_d_param_2,
+	.param .u32 reduce_col_max_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<9>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_max_d_param_0];
+	ld.param.u64 	%rd3, [reduce_col_max_d_param_1];
+	ld.param.u32 	%r5, [reduce_col_max_d_param_2];
+	ld.param.u32 	%r6, [reduce_col_max_d_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB12_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f64 	%fd8, 0dFFEFFFFFFFFFFFFF;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB12_4;
+
+	mov.u32 	%r10, %r1;
+
+BB12_3:
+	mul.wide.u32 	%rd4, %r10, 8;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f64 	%fd6, [%rd5];
+	max.f64 	%fd8, %fd8, %fd6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB12_3;
+
+BB12_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd8;
+
+BB12_5:
+	ret;
+}
+
+	// .globl	reduce_col_max_f
+.visible .entry reduce_col_max_f(
+	.param .u64 reduce_col_max_f_param_0,
+	.param .u64 reduce_col_max_f_param_1,
+	.param .u32 reduce_col_max_f_param_2,
+	.param .u32 reduce_col_max_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<9>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_max_f_param_0];
+	ld.param.u64 	%rd3, [reduce_col_max_f_param_1];
+	ld.param.u32 	%r5, [reduce_col_max_f_param_2];
+	ld.param.u32 	%r6, [reduce_col_max_f_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB13_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f32 	%f8, 0fFF7FFFFF;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB13_4;
+
+	mov.u32 	%r10, %r1;
+
+BB13_3:
+	mul.wide.u32 	%rd4, %r10, 4;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f32 	%f6, [%rd5];
+	max.f32 	%f8, %f8, %f6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB13_3;
+
+BB13_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f8;
+
+BB13_5:
+	ret;
+}
+
+	// .globl	reduce_min_d
+.visible .entry reduce_min_d(
+	.param .u64 reduce_min_d_param_0,
+	.param .u64 reduce_min_d_param_1,
+	.param .u32 reduce_min_d_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot14[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<61>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot14;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_min_d_param_0];
+	ld.param.u64 	%rd2, [reduce_min_d_param_1];
+	ld.param.u32 	%r6, [reduce_min_d_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f64 	%fd45, 0d7FEFFFFFFFFFFFFF;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB14_4;
+
+BB14_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd30, [%rd5];
+	min.f64 	%fd45, %fd45, %fd30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB14_3;
+
+	mul.wide.u32 	%rd7, %r3, 8;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f64 	%fd31, [%rd8];
+	min.f64 	%fd45, %fd45, %fd31;
+
+BB14_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB14_1;
+
+BB14_4:
+	shl.b32 	%r16, %r10, 3;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f64 	[%r5], %fd45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB14_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB14_7;
+
+	ld.shared.f64 	%fd32, [%r5+4096];
+	min.f64 	%fd45, %fd45, %fd32;
+	st.shared.f64 	[%r5], %fd45;
+
+BB14_7:
+	bar.sync 	0;
+
+BB14_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB14_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB14_11;
+
+	ld.shared.f64 	%fd33, [%r5+2048];
+	min.f64 	%fd45, %fd45, %fd33;
+	st.shared.f64 	[%r5], %fd45;
+
+BB14_11:
+	bar.sync 	0;
+
+BB14_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB14_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB14_15;
+
+	ld.shared.f64 	%fd34, [%r5+1024];
+	min.f64 	%fd45, %fd45, %fd34;
+	st.shared.f64 	[%r5], %fd45;
+
+BB14_15:
+	bar.sync 	0;
+
+BB14_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB14_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB14_19;
+
+	ld.shared.f64 	%fd35, [%r5+512];
+	min.f64 	%fd45, %fd45, %fd35;
+	st.shared.f64 	[%r5], %fd45;
+
+BB14_19:
+	bar.sync 	0;
+
+BB14_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB14_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB14_23;
+
+	ld.volatile.shared.f64 	%fd36, [%r5+256];
+	min.f64 	%fd45, %fd45, %fd36;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB14_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB14_25;
+
+	ld.volatile.shared.f64 	%fd37, [%r5+128];
+	min.f64 	%fd45, %fd45, %fd37;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB14_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB14_27;
+
+	ld.volatile.shared.f64 	%fd38, [%r5+64];
+	min.f64 	%fd45, %fd45, %fd38;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB14_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB14_29;
+
+	ld.volatile.shared.f64 	%fd39, [%r5+32];
+	min.f64 	%fd45, %fd45, %fd39;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB14_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB14_31;
+
+	ld.volatile.shared.f64 	%fd40, [%r5+16];
+	min.f64 	%fd45, %fd45, %fd40;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB14_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB14_33;
+
+	ld.volatile.shared.f64 	%fd41, [%r5+8];
+	min.f64 	%fd42, %fd45, %fd41;
+	st.volatile.shared.f64 	[%r5], %fd42;
+
+BB14_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB14_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB14_36;
+
+	ld.shared.f64 	%fd43, [memory];
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd43;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 4
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 4
+
+BB14_36:
+	ld.shared.f64 	%fd44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 8;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd44;
+
+BB14_37:
+	ret;
+}
+
+	// .globl	reduce_min_f
+.visible .entry reduce_min_f(
+	.param .u64 reduce_min_f_param_0,
+	.param .u64 reduce_min_f_param_1,
+	.param .u32 reduce_min_f_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot15[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .f32 	%f<61>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot15;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_min_f_param_0];
+	ld.param.u64 	%rd2, [reduce_min_f_param_1];
+	ld.param.u32 	%r6, [reduce_min_f_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f32 	%f45, 0f7F7FFFFF;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB15_4;
+
+BB15_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f30, [%rd5];
+	min.f32 	%f45, %f45, %f30;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB15_3;
+
+	mul.wide.u32 	%rd7, %r3, 4;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f32 	%f31, [%rd8];
+	min.f32 	%f45, %f45, %f31;
+
+BB15_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB15_1;
+
+BB15_4:
+	shl.b32 	%r16, %r10, 2;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f32 	[%r5], %f45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB15_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB15_7;
+
+	ld.shared.f32 	%f32, [%r5+2048];
+	min.f32 	%f45, %f45, %f32;
+	st.shared.f32 	[%r5], %f45;
+
+BB15_7:
+	bar.sync 	0;
+
+BB15_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB15_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB15_11;
+
+	ld.shared.f32 	%f33, [%r5+1024];
+	min.f32 	%f45, %f45, %f33;
+	st.shared.f32 	[%r5], %f45;
+
+BB15_11:
+	bar.sync 	0;
+
+BB15_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB15_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB15_15;
+
+	ld.shared.f32 	%f34, [%r5+512];
+	min.f32 	%f45, %f45, %f34;
+	st.shared.f32 	[%r5], %f45;
+
+BB15_15:
+	bar.sync 	0;
+
+BB15_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB15_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB15_19;
+
+	ld.shared.f32 	%f35, [%r5+256];
+	min.f32 	%f45, %f45, %f35;
+	st.shared.f32 	[%r5], %f45;
+
+BB15_19:
+	bar.sync 	0;
+
+BB15_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB15_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB15_23;
+
+	ld.volatile.shared.f32 	%f36, [%r5+128];
+	min.f32 	%f45, %f45, %f36;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB15_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB15_25;
+
+	ld.volatile.shared.f32 	%f37, [%r5+64];
+	min.f32 	%f45, %f45, %f37;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB15_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB15_27;
+
+	ld.volatile.shared.f32 	%f38, [%r5+32];
+	min.f32 	%f45, %f45, %f38;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB15_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB15_29;
+
+	ld.volatile.shared.f32 	%f39, [%r5+16];
+	min.f32 	%f45, %f45, %f39;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB15_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB15_31;
+
+	ld.volatile.shared.f32 	%f40, [%r5+8];
+	min.f32 	%f45, %f45, %f40;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB15_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB15_33;
+
+	ld.volatile.shared.f32 	%f41, [%r5+4];
+	min.f32 	%f42, %f45, %f41;
+	st.volatile.shared.f32 	[%r5], %f42;
+
+BB15_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB15_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB15_36;
+
+	ld.shared.f32 	%f43, [memory];
+	cvt.f64.f32	%fd1, %f43;
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd1;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 5
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 5
+
+BB15_36:
+	ld.shared.f32 	%f44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f32 	[%rd15], %f44;
+
+BB15_37:
+	ret;
+}
+
+	// .globl	reduce_row_min_d
+.visible .entry reduce_row_min_d(
+	.param .u64 reduce_row_min_d_param_0,
+	.param .u64 reduce_row_min_d_param_1,
+	.param .u32 reduce_row_min_d_param_2,
+	.param .u32 reduce_row_min_d_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .b32 	%r<72>;
+	.reg .f64 	%fd<56>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_min_d_param_0];
+	ld.param.u64 	%rd2, [reduce_row_min_d_param_1];
+	ld.param.u32 	%r5, [reduce_row_min_d_param_2];
+	ld.param.u32 	%r4, [reduce_row_min_d_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB16_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f64 	%fd6, 0d7FEFFFFFFFFFFFFF;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB16_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB16_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd28, [%rd5];
+	min.f64 	%fd6, %fd6, %fd28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB16_3;
+
+BB16_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 3;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f64 	[%r13], %fd6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB16_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB16_7;
+
+	ld.shared.f64 	%fd29, [%r13+4096];
+	min.f64 	%fd6, %fd6, %fd29;
+	st.shared.f64 	[%r13], %fd6;
+
+BB16_7:
+	bar.sync 	0;
+
+BB16_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB16_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB16_11;
+
+	ld.shared.f64 	%fd30, [%r13+2048];
+	min.f64 	%fd6, %fd6, %fd30;
+	st.shared.f64 	[%r13], %fd6;
+
+BB16_11:
+	bar.sync 	0;
+
+BB16_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB16_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB16_15;
+
+	ld.shared.f64 	%fd31, [%r13+1024];
+	min.f64 	%fd6, %fd6, %fd31;
+	st.shared.f64 	[%r13], %fd6;
+
+BB16_15:
+	bar.sync 	0;
+
+BB16_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB16_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB16_19;
+
+	ld.shared.f64 	%fd32, [%r13+512];
+	min.f64 	%fd6, %fd6, %fd32;
+	st.shared.f64 	[%r13], %fd6;
+
+BB16_19:
+	bar.sync 	0;
+
+BB16_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB16_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB16_23;
+
+	ld.volatile.shared.f64 	%fd33, [%r13+256];
+	min.f64 	%fd6, %fd6, %fd33;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB16_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB16_25;
+
+	ld.volatile.shared.f64 	%fd34, [%r13+128];
+	min.f64 	%fd6, %fd6, %fd34;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB16_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB16_27;
+
+	ld.volatile.shared.f64 	%fd35, [%r13+64];
+	min.f64 	%fd6, %fd6, %fd35;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB16_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB16_29;
+
+	ld.volatile.shared.f64 	%fd36, [%r13+32];
+	min.f64 	%fd6, %fd6, %fd36;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB16_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB16_31;
+
+	ld.volatile.shared.f64 	%fd37, [%r13+16];
+	min.f64 	%fd6, %fd6, %fd37;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB16_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB16_33;
+
+	ld.volatile.shared.f64 	%fd38, [%r13+8];
+	min.f64 	%fd39, %fd6, %fd38;
+	st.volatile.shared.f64 	[%r13], %fd39;
+
+BB16_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB16_35;
+
+	ld.shared.f64 	%fd40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd40;
+
+BB16_35:
+	ret;
+}
+
+	// .globl	reduce_row_min_f
+.visible .entry reduce_row_min_f(
+	.param .u64 reduce_row_min_f_param_0,
+	.param .u64 reduce_row_min_f_param_1,
+	.param .u32 reduce_row_min_f_param_2,
+	.param .u32 reduce_row_min_f_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .f32 	%f<56>;
+	.reg .b32 	%r<72>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_min_f_param_0];
+	ld.param.u64 	%rd2, [reduce_row_min_f_param_1];
+	ld.param.u32 	%r5, [reduce_row_min_f_param_2];
+	ld.param.u32 	%r4, [reduce_row_min_f_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB17_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f32 	%f6, 0f7F7FFFFF;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB17_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB17_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f28, [%rd5];
+	min.f32 	%f6, %f6, %f28;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB17_3;
+
+BB17_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 2;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f32 	[%r13], %f6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB17_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB17_7;
+
+	ld.shared.f32 	%f29, [%r13+2048];
+	min.f32 	%f6, %f6, %f29;
+	st.shared.f32 	[%r13], %f6;
+
+BB17_7:
+	bar.sync 	0;
+
+BB17_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB17_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB17_11;
+
+	ld.shared.f32 	%f30, [%r13+1024];
+	min.f32 	%f6, %f6, %f30;
+	st.shared.f32 	[%r13], %f6;
+
+BB17_11:
+	bar.sync 	0;
+
+BB17_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB17_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB17_15;
+
+	ld.shared.f32 	%f31, [%r13+512];
+	min.f32 	%f6, %f6, %f31;
+	st.shared.f32 	[%r13], %f6;
+
+BB17_15:
+	bar.sync 	0;
+
+BB17_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB17_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB17_19;
+
+	ld.shared.f32 	%f32, [%r13+256];
+	min.f32 	%f6, %f6, %f32;
+	st.shared.f32 	[%r13], %f6;
+
+BB17_19:
+	bar.sync 	0;
+
+BB17_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB17_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB17_23;
+
+	ld.volatile.shared.f32 	%f33, [%r13+128];
+	min.f32 	%f6, %f6, %f33;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB17_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB17_25;
+
+	ld.volatile.shared.f32 	%f34, [%r13+64];
+	min.f32 	%f6, %f6, %f34;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB17_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB17_27;
+
+	ld.volatile.shared.f32 	%f35, [%r13+32];
+	min.f32 	%f6, %f6, %f35;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB17_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB17_29;
+
+	ld.volatile.shared.f32 	%f36, [%r13+16];
+	min.f32 	%f6, %f6, %f36;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB17_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB17_31;
+
+	ld.volatile.shared.f32 	%f37, [%r13+8];
+	min.f32 	%f6, %f6, %f37;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB17_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB17_33;
+
+	ld.volatile.shared.f32 	%f38, [%r13+4];
+	min.f32 	%f39, %f6, %f38;
+	st.volatile.shared.f32 	[%r13], %f39;
+
+BB17_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB17_35;
+
+	ld.shared.f32 	%f40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f40;
+
+BB17_35:
+	ret;
+}
+
+	// .globl	reduce_col_min_d
+.visible .entry reduce_col_min_d(
+	.param .u64 reduce_col_min_d_param_0,
+	.param .u64 reduce_col_min_d_param_1,
+	.param .u32 reduce_col_min_d_param_2,
+	.param .u32 reduce_col_min_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<9>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_min_d_param_0];
+	ld.param.u64 	%rd3, [reduce_col_min_d_param_1];
+	ld.param.u32 	%r5, [reduce_col_min_d_param_2];
+	ld.param.u32 	%r6, [reduce_col_min_d_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB18_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f64 	%fd8, 0d7FEFFFFFFFFFFFFF;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB18_4;
+
+	mov.u32 	%r10, %r1;
+
+BB18_3:
+	mul.wide.u32 	%rd4, %r10, 8;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f64 	%fd6, [%rd5];
+	min.f64 	%fd8, %fd8, %fd6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB18_3;
+
+BB18_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd8;
+
+BB18_5:
+	ret;
+}
+
+	// .globl	reduce_col_min_f
+.visible .entry reduce_col_min_f(
+	.param .u64 reduce_col_min_f_param_0,
+	.param .u64 reduce_col_min_f_param_1,
+	.param .u32 reduce_col_min_f_param_2,
+	.param .u32 reduce_col_min_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<9>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_min_f_param_0];
+	ld.param.u64 	%rd3, [reduce_col_min_f_param_1];
+	ld.param.u32 	%r5, [reduce_col_min_f_param_2];
+	ld.param.u32 	%r6, [reduce_col_min_f_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB19_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f32 	%f8, 0f7F7FFFFF;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB19_4;
+
+	mov.u32 	%r10, %r1;
+
+BB19_3:
+	mul.wide.u32 	%rd4, %r10, 4;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f32 	%f6, [%rd5];
+	min.f32 	%f8, %f8, %f6;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB19_3;
+
+BB19_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f8;
+
+BB19_5:
+	ret;
+}
+
+	// .globl	reduce_sum_sq_d
+.visible .entry reduce_sum_sq_d(
+	.param .u64 reduce_sum_sq_d_param_0,
+	.param .u64 reduce_sum_sq_d_param_1,
+	.param .u32 reduce_sum_sq_d_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot20[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<61>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot20;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_sum_sq_d_param_0];
+	ld.param.u64 	%rd2, [reduce_sum_sq_d_param_1];
+	ld.param.u32 	%r6, [reduce_sum_sq_d_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f64 	%fd45, 0d0000000000000000;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB20_4;
+
+BB20_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd30, [%rd5];
+	fma.rn.f64 	%fd45, %fd30, %fd30, %fd45;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB20_3;
+
+	mul.wide.u32 	%rd7, %r3, 8;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f64 	%fd31, [%rd8];
+	fma.rn.f64 	%fd45, %fd31, %fd31, %fd45;
+
+BB20_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB20_1;
+
+BB20_4:
+	shl.b32 	%r16, %r10, 3;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f64 	[%r5], %fd45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB20_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB20_7;
+
+	ld.shared.f64 	%fd32, [%r5+4096];
+	fma.rn.f64 	%fd45, %fd32, %fd32, %fd45;
+	st.shared.f64 	[%r5], %fd45;
+
+BB20_7:
+	bar.sync 	0;
+
+BB20_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB20_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB20_11;
+
+	ld.shared.f64 	%fd33, [%r5+2048];
+	fma.rn.f64 	%fd45, %fd33, %fd33, %fd45;
+	st.shared.f64 	[%r5], %fd45;
+
+BB20_11:
+	bar.sync 	0;
+
+BB20_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB20_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB20_15;
+
+	ld.shared.f64 	%fd34, [%r5+1024];
+	fma.rn.f64 	%fd45, %fd34, %fd34, %fd45;
+	st.shared.f64 	[%r5], %fd45;
+
+BB20_15:
+	bar.sync 	0;
+
+BB20_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB20_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB20_19;
+
+	ld.shared.f64 	%fd35, [%r5+512];
+	fma.rn.f64 	%fd45, %fd35, %fd35, %fd45;
+	st.shared.f64 	[%r5], %fd45;
+
+BB20_19:
+	bar.sync 	0;
+
+BB20_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB20_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB20_23;
+
+	ld.volatile.shared.f64 	%fd36, [%r5+256];
+	fma.rn.f64 	%fd45, %fd36, %fd36, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB20_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB20_25;
+
+	ld.volatile.shared.f64 	%fd37, [%r5+128];
+	fma.rn.f64 	%fd45, %fd37, %fd37, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB20_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB20_27;
+
+	ld.volatile.shared.f64 	%fd38, [%r5+64];
+	fma.rn.f64 	%fd45, %fd38, %fd38, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB20_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB20_29;
+
+	ld.volatile.shared.f64 	%fd39, [%r5+32];
+	fma.rn.f64 	%fd45, %fd39, %fd39, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB20_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB20_31;
+
+	ld.volatile.shared.f64 	%fd40, [%r5+16];
+	fma.rn.f64 	%fd45, %fd40, %fd40, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd45;
+
+BB20_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB20_33;
+
+	ld.volatile.shared.f64 	%fd41, [%r5+8];
+	fma.rn.f64 	%fd42, %fd41, %fd41, %fd45;
+	st.volatile.shared.f64 	[%r5], %fd42;
+
+BB20_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB20_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB20_36;
+
+	ld.shared.f64 	%fd43, [memory];
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd43;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 6
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 6
+
+BB20_36:
+	ld.shared.f64 	%fd44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 8;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f64 	[%rd15], %fd44;
+
+BB20_37:
+	ret;
+}
+
+	// .globl	reduce_sum_sq_f
+.visible .entry reduce_sum_sq_f(
+	.param .u64 reduce_sum_sq_f_param_0,
+	.param .u64 reduce_sum_sq_f_param_1,
+	.param .u32 reduce_sum_sq_f_param_2
+)
+{
+	.local .align 16 .b8 	__local_depot21[16];
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<21>;
+	.reg .f32 	%f<61>;
+	.reg .b32 	%r<39>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<16>;
+
+
+	mov.u64 	%SPL, __local_depot21;
+	cvta.local.u64 	%SP, %SPL;
+	ld.param.u64 	%rd1, [reduce_sum_sq_f_param_0];
+	ld.param.u64 	%rd2, [reduce_sum_sq_f_param_1];
+	ld.param.u32 	%r6, [reduce_sum_sq_f_param_2];
+	mov.u32 	%r7, %ctaid.x;
+	shl.b32 	%r8, %r7, 1;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r38, %r8, %r9, %r10;
+	mov.f32 	%f45, 0f00000000;
+	setp.ge.u32	%p1, %r38, %r6;
+	@%p1 bra 	BB21_4;
+
+BB21_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.u32 	%rd4, %r38, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f30, [%rd5];
+	fma.rn.f32 	%f45, %f30, %f30, %f45;
+	add.s32 	%r3, %r38, %r9;
+	setp.ge.u32	%p2, %r3, %r6;
+	@%p2 bra 	BB21_3;
+
+	mul.wide.u32 	%rd7, %r3, 4;
+	add.s64 	%rd8, %rd3, %rd7;
+	ld.global.f32 	%f31, [%rd8];
+	fma.rn.f32 	%f45, %f31, %f31, %f45;
+
+BB21_3:
+	shl.b32 	%r13, %r9, 1;
+	mov.u32 	%r14, %nctaid.x;
+	mad.lo.s32 	%r38, %r13, %r14, %r38;
+	setp.lt.u32	%p3, %r38, %r6;
+	@%p3 bra 	BB21_1;
+
+BB21_4:
+	shl.b32 	%r16, %r10, 2;
+	mov.u32 	%r17, memory;
+	add.s32 	%r5, %r17, %r16;
+	st.shared.f32 	[%r5], %f45;
+	bar.sync 	0;
+	setp.lt.u32	%p4, %r9, 1024;
+	@%p4 bra 	BB21_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB21_7;
+
+	ld.shared.f32 	%f32, [%r5+2048];
+	fma.rn.f32 	%f45, %f32, %f32, %f45;
+	st.shared.f32 	[%r5], %f45;
+
+BB21_7:
+	bar.sync 	0;
+
+BB21_8:
+	setp.lt.u32	%p6, %r9, 512;
+	@%p6 bra 	BB21_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB21_11;
+
+	ld.shared.f32 	%f33, [%r5+1024];
+	fma.rn.f32 	%f45, %f33, %f33, %f45;
+	st.shared.f32 	[%r5], %f45;
+
+BB21_11:
+	bar.sync 	0;
+
+BB21_12:
+	setp.lt.u32	%p8, %r9, 256;
+	@%p8 bra 	BB21_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB21_15;
+
+	ld.shared.f32 	%f34, [%r5+512];
+	fma.rn.f32 	%f45, %f34, %f34, %f45;
+	st.shared.f32 	[%r5], %f45;
+
+BB21_15:
+	bar.sync 	0;
+
+BB21_16:
+	setp.lt.u32	%p10, %r9, 128;
+	@%p10 bra 	BB21_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB21_19;
+
+	ld.shared.f32 	%f35, [%r5+256];
+	fma.rn.f32 	%f45, %f35, %f35, %f45;
+	st.shared.f32 	[%r5], %f45;
+
+BB21_19:
+	bar.sync 	0;
+
+BB21_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB21_33;
+
+	setp.lt.u32	%p13, %r9, 64;
+	@%p13 bra 	BB21_23;
+
+	ld.volatile.shared.f32 	%f36, [%r5+128];
+	fma.rn.f32 	%f45, %f36, %f36, %f45;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB21_23:
+	setp.lt.u32	%p14, %r9, 32;
+	@%p14 bra 	BB21_25;
+
+	ld.volatile.shared.f32 	%f37, [%r5+64];
+	fma.rn.f32 	%f45, %f37, %f37, %f45;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB21_25:
+	setp.lt.u32	%p15, %r9, 16;
+	@%p15 bra 	BB21_27;
+
+	ld.volatile.shared.f32 	%f38, [%r5+32];
+	fma.rn.f32 	%f45, %f38, %f38, %f45;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB21_27:
+	setp.lt.u32	%p16, %r9, 8;
+	@%p16 bra 	BB21_29;
+
+	ld.volatile.shared.f32 	%f39, [%r5+16];
+	fma.rn.f32 	%f45, %f39, %f39, %f45;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB21_29:
+	setp.lt.u32	%p17, %r9, 4;
+	@%p17 bra 	BB21_31;
+
+	ld.volatile.shared.f32 	%f40, [%r5+8];
+	fma.rn.f32 	%f45, %f40, %f40, %f45;
+	st.volatile.shared.f32 	[%r5], %f45;
+
+BB21_31:
+	setp.lt.u32	%p18, %r9, 2;
+	@%p18 bra 	BB21_33;
+
+	ld.volatile.shared.f32 	%f41, [%r5+4];
+	fma.rn.f32 	%f42, %f41, %f41, %f45;
+	st.volatile.shared.f32 	[%r5], %f42;
+
+BB21_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB21_37;
+
+	mov.u32 	%r34, %nctaid.x;
+	setp.gt.u32	%p20, %r34, 9;
+	@%p20 bra 	BB21_36;
+
+	ld.shared.f32 	%f43, [memory];
+	cvt.f64.f32	%fd1, %f43;
+	add.u64 	%rd9, %SP, 0;
+	add.u64 	%rd10, %SPL, 0;
+	st.local.u32 	[%rd10], %r7;
+	st.local.f64 	[%rd10+8], %fd1;
+	mov.u64 	%rd11, $str;
+	cvta.global.u64 	%rd12, %rd11;
+	// Callseq Start 7
+	{
+	.reg .b32 temp_param_reg;
+	// <end>}
+	.param .b64 param0;
+	st.param.b64	[param0+0], %rd12;
+	.param .b64 param1;
+	st.param.b64	[param1+0], %rd9;
+	.param .b32 retval0;
+	call.uni (retval0), 
+	vprintf, 
+	(
+	param0, 
+	param1
+	);
+	ld.param.b32	%r36, [retval0+0];
+	
+	//{
+	}// Callseq End 7
+
+BB21_36:
+	ld.shared.f32 	%f44, [memory];
+	cvta.to.global.u64 	%rd13, %rd2;
+	mul.wide.u32 	%rd14, %r7, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	st.global.f32 	[%rd15], %f44;
+
+BB21_37:
+	ret;
+}
+
+	// .globl	reduce_col_sum_sq_d
+.visible .entry reduce_col_sum_sq_d(
+	.param .u64 reduce_col_sum_sq_d_param_0,
+	.param .u64 reduce_col_sum_sq_d_param_1,
+	.param .u32 reduce_col_sum_sq_d_param_2,
+	.param .u32 reduce_col_sum_sq_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<9>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_sum_sq_d_param_0];
+	ld.param.u64 	%rd3, [reduce_col_sum_sq_d_param_1];
+	ld.param.u32 	%r5, [reduce_col_sum_sq_d_param_2];
+	ld.param.u32 	%r6, [reduce_col_sum_sq_d_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB22_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f64 	%fd8, 0d0000000000000000;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB22_4;
+
+	mov.u32 	%r10, %r1;
+
+BB22_3:
+	mul.wide.u32 	%rd4, %r10, 8;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f64 	%fd6, [%rd5];
+	fma.rn.f64 	%fd8, %fd6, %fd6, %fd8;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB22_3;
+
+BB22_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd8;
+
+BB22_5:
+	ret;
+}
+
+	// .globl	reduce_col_sum_sq_f
+.visible .entry reduce_col_sum_sq_f(
+	.param .u64 reduce_col_sum_sq_f_param_0,
+	.param .u64 reduce_col_sum_sq_f_param_1,
+	.param .u32 reduce_col_sum_sq_f_param_2,
+	.param .u32 reduce_col_sum_sq_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<9>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd2, [reduce_col_sum_sq_f_param_0];
+	ld.param.u64 	%rd3, [reduce_col_sum_sq_f_param_1];
+	ld.param.u32 	%r5, [reduce_col_sum_sq_f_param_2];
+	ld.param.u32 	%r6, [reduce_col_sum_sq_f_param_3];
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	setp.ge.u32	%p1, %r1, %r6;
+	@%p1 bra 	BB23_5;
+
+	mul.lo.s32 	%r2, %r6, %r5;
+	cvta.to.global.u64 	%rd1, %rd2;
+	mov.f32 	%f8, 0f00000000;
+	setp.ge.u32	%p2, %r1, %r2;
+	@%p2 bra 	BB23_4;
+
+	mov.u32 	%r10, %r1;
+
+BB23_3:
+	mul.wide.u32 	%rd4, %r10, 4;
+	add.s64 	%rd5, %rd1, %rd4;
+	ld.global.f32 	%f6, [%rd5];
+	fma.rn.f32 	%f8, %f6, %f6, %f8;
+	add.s32 	%r10, %r10, %r6;
+	setp.lt.u32	%p3, %r10, %r2;
+	@%p3 bra 	BB23_3;
+
+BB23_4:
+	cvta.to.global.u64 	%rd6, %rd3;
+	mul.wide.u32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f8;
+
+BB23_5:
+	ret;
+}
+
+	// .globl	reduce_row_sum_sq_d
+.visible .entry reduce_row_sum_sq_d(
+	.param .u64 reduce_row_sum_sq_d_param_0,
+	.param .u64 reduce_row_sum_sq_d_param_1,
+	.param .u32 reduce_row_sum_sq_d_param_2,
+	.param .u32 reduce_row_sum_sq_d_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .b32 	%r<72>;
+	.reg .f64 	%fd<56>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_sum_sq_d_param_0];
+	ld.param.u64 	%rd2, [reduce_row_sum_sq_d_param_1];
+	ld.param.u32 	%r5, [reduce_row_sum_sq_d_param_2];
+	ld.param.u32 	%r4, [reduce_row_sum_sq_d_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB24_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f64 	%fd6, 0d0000000000000000;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB24_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB24_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd28, [%rd5];
+	fma.rn.f64 	%fd6, %fd28, %fd28, %fd6;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB24_3;
+
+BB24_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 3;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f64 	[%r13], %fd6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB24_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB24_7;
+
+	ld.shared.f64 	%fd29, [%r13+4096];
+	fma.rn.f64 	%fd6, %fd29, %fd29, %fd6;
+	st.shared.f64 	[%r13], %fd6;
+
+BB24_7:
+	bar.sync 	0;
+
+BB24_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB24_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB24_11;
+
+	ld.shared.f64 	%fd30, [%r13+2048];
+	fma.rn.f64 	%fd6, %fd30, %fd30, %fd6;
+	st.shared.f64 	[%r13], %fd6;
+
+BB24_11:
+	bar.sync 	0;
+
+BB24_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB24_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB24_15;
+
+	ld.shared.f64 	%fd31, [%r13+1024];
+	fma.rn.f64 	%fd6, %fd31, %fd31, %fd6;
+	st.shared.f64 	[%r13], %fd6;
+
+BB24_15:
+	bar.sync 	0;
+
+BB24_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB24_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB24_19;
+
+	ld.shared.f64 	%fd32, [%r13+512];
+	fma.rn.f64 	%fd6, %fd32, %fd32, %fd6;
+	st.shared.f64 	[%r13], %fd6;
+
+BB24_19:
+	bar.sync 	0;
+
+BB24_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB24_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB24_23;
+
+	ld.volatile.shared.f64 	%fd33, [%r13+256];
+	fma.rn.f64 	%fd6, %fd33, %fd33, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB24_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB24_25;
+
+	ld.volatile.shared.f64 	%fd34, [%r13+128];
+	fma.rn.f64 	%fd6, %fd34, %fd34, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB24_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB24_27;
+
+	ld.volatile.shared.f64 	%fd35, [%r13+64];
+	fma.rn.f64 	%fd6, %fd35, %fd35, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB24_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB24_29;
+
+	ld.volatile.shared.f64 	%fd36, [%r13+32];
+	fma.rn.f64 	%fd6, %fd36, %fd36, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB24_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB24_31;
+
+	ld.volatile.shared.f64 	%fd37, [%r13+16];
+	fma.rn.f64 	%fd6, %fd37, %fd37, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd6;
+
+BB24_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB24_33;
+
+	ld.volatile.shared.f64 	%fd38, [%r13+8];
+	fma.rn.f64 	%fd39, %fd38, %fd38, %fd6;
+	st.volatile.shared.f64 	[%r13], %fd39;
+
+BB24_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB24_35;
+
+	ld.shared.f64 	%fd40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd40;
+
+BB24_35:
+	ret;
+}
+
+	// .globl	reduce_row_sum_sq_f
+.visible .entry reduce_row_sum_sq_f(
+	.param .u64 reduce_row_sum_sq_f_param_0,
+	.param .u64 reduce_row_sum_sq_f_param_1,
+	.param .u32 reduce_row_sum_sq_f_param_2,
+	.param .u32 reduce_row_sum_sq_f_param_3
+)
+{
+	.reg .pred 	%p<20>;
+	.reg .f32 	%f<56>;
+	.reg .b32 	%r<72>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [reduce_row_sum_sq_f_param_0];
+	ld.param.u64 	%rd2, [reduce_row_sum_sq_f_param_1];
+	ld.param.u32 	%r5, [reduce_row_sum_sq_f_param_2];
+	ld.param.u32 	%r4, [reduce_row_sum_sq_f_param_3];
+	mov.u32 	%r6, %ctaid.x;
+	setp.ge.u32	%p1, %r6, %r5;
+	@%p1 bra 	BB25_35;
+
+	mov.u32 	%r71, %tid.x;
+	mov.f32 	%f6, 0f00000000;
+	setp.ge.u32	%p2, %r71, %r4;
+	@%p2 bra 	BB25_4;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+
+BB25_3:
+	mad.lo.s32 	%r8, %r6, %r4, %r71;
+	mul.wide.u32 	%rd4, %r8, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f28, [%rd5];
+	fma.rn.f32 	%f6, %f28, %f28, %f6;
+	mov.u32 	%r9, %ntid.x;
+	add.s32 	%r71, %r9, %r71;
+	setp.lt.u32	%p3, %r71, %r4;
+	@%p3 bra 	BB25_3;
+
+BB25_4:
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 2;
+	mov.u32 	%r12, memory;
+	add.s32 	%r13, %r12, %r11;
+	st.shared.f32 	[%r13], %f6;
+	bar.sync 	0;
+	mov.u32 	%r14, %ntid.x;
+	setp.lt.u32	%p4, %r14, 1024;
+	@%p4 bra 	BB25_8;
+
+	setp.gt.u32	%p5, %r10, 511;
+	@%p5 bra 	BB25_7;
+
+	ld.shared.f32 	%f29, [%r13+2048];
+	fma.rn.f32 	%f6, %f29, %f29, %f6;
+	st.shared.f32 	[%r13], %f6;
+
+BB25_7:
+	bar.sync 	0;
+
+BB25_8:
+	setp.lt.u32	%p6, %r14, 512;
+	@%p6 bra 	BB25_12;
+
+	setp.gt.u32	%p7, %r10, 255;
+	@%p7 bra 	BB25_11;
+
+	ld.shared.f32 	%f30, [%r13+1024];
+	fma.rn.f32 	%f6, %f30, %f30, %f6;
+	st.shared.f32 	[%r13], %f6;
+
+BB25_11:
+	bar.sync 	0;
+
+BB25_12:
+	setp.lt.u32	%p8, %r14, 256;
+	@%p8 bra 	BB25_16;
+
+	setp.gt.u32	%p9, %r10, 127;
+	@%p9 bra 	BB25_15;
+
+	ld.shared.f32 	%f31, [%r13+512];
+	fma.rn.f32 	%f6, %f31, %f31, %f6;
+	st.shared.f32 	[%r13], %f6;
+
+BB25_15:
+	bar.sync 	0;
+
+BB25_16:
+	setp.lt.u32	%p10, %r14, 128;
+	@%p10 bra 	BB25_20;
+
+	setp.gt.u32	%p11, %r10, 63;
+	@%p11 bra 	BB25_19;
+
+	ld.shared.f32 	%f32, [%r13+256];
+	fma.rn.f32 	%f6, %f32, %f32, %f6;
+	st.shared.f32 	[%r13], %f6;
+
+BB25_19:
+	bar.sync 	0;
+
+BB25_20:
+	setp.gt.u32	%p12, %r10, 31;
+	@%p12 bra 	BB25_33;
+
+	setp.lt.u32	%p13, %r14, 64;
+	@%p13 bra 	BB25_23;
+
+	ld.volatile.shared.f32 	%f33, [%r13+128];
+	fma.rn.f32 	%f6, %f33, %f33, %f6;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB25_23:
+	setp.lt.u32	%p14, %r14, 32;
+	@%p14 bra 	BB25_25;
+
+	ld.volatile.shared.f32 	%f34, [%r13+64];
+	fma.rn.f32 	%f6, %f34, %f34, %f6;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB25_25:
+	setp.lt.u32	%p15, %r14, 16;
+	@%p15 bra 	BB25_27;
+
+	ld.volatile.shared.f32 	%f35, [%r13+32];
+	fma.rn.f32 	%f6, %f35, %f35, %f6;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB25_27:
+	setp.lt.u32	%p16, %r14, 8;
+	@%p16 bra 	BB25_29;
+
+	ld.volatile.shared.f32 	%f36, [%r13+16];
+	fma.rn.f32 	%f6, %f36, %f36, %f6;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB25_29:
+	setp.lt.u32	%p17, %r14, 4;
+	@%p17 bra 	BB25_31;
+
+	ld.volatile.shared.f32 	%f37, [%r13+8];
+	fma.rn.f32 	%f6, %f37, %f37, %f6;
+	st.volatile.shared.f32 	[%r13], %f6;
+
+BB25_31:
+	setp.lt.u32	%p18, %r14, 2;
+	@%p18 bra 	BB25_33;
+
+	ld.volatile.shared.f32 	%f38, [%r13+4];
+	fma.rn.f32 	%f39, %f38, %f38, %f6;
+	st.volatile.shared.f32 	[%r13], %f39;
+
+BB25_33:
+	setp.ne.s32	%p19, %r10, 0;
+	@%p19 bra 	BB25_35;
+
+	ld.shared.f32 	%f40, [memory];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.u32 	%rd7, %r6, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f40;
+
+BB25_35:
+	ret;
+}
+
+
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
new file mode 100644
index 0000000..36299c7
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "SpoofCUDAContext.h"
+
+#include <filesystem>
+#include <iostream>
+#include <cstdlib>
+#include <sstream>
+
+size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resource_path) {
+
+#ifdef __DEBUG
+	std::cout << "initializing cuda device " << device_id << std::endl;
+#endif
+
+  SpoofCUDAContext *ctx = new SpoofCUDAContext(resource_path);
+  // cuda device is handled by jCuda atm
+  //cudaSetDevice(device_id);
+  //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+  //cudaDeviceSynchronize();
+
+  CHECK_CUDA(cuModuleLoad(&(ctx->reductions), std::string(ctx->resource_path + std::string("/cuda/kernels/reduction.ptx")).c_str()));
+
+  CUfunction func;
+
+  // ToDo: implement a more scalable solution for these imports
+
+  // SUM
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_d", func));
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_f", func));
+
+  // SUM_SQ
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_d"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", func));
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_f"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", func));
+
+  // MIN
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_d"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_min_d", func));
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_min_f", func));
+
+  // MAX
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_max_d", func));
+  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_f"));
+  ctx->reduction_kernels.insert(std::make_pair("reduce_max_f", func));
+
+  return reinterpret_cast<size_t>(ctx);
+}
+
+void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id) {
+  delete ctx;
+  ctx = nullptr;
+  // cuda device is handled by jCuda atm
+  //cudaDeviceReset();
+}
+
+bool SpoofCUDAContext::compile_cuda(const std::string &src,
+                                    const std::string &name) {
+    std::string cuda_include_path("");
+    char* cdp = std::getenv("CUDA_PATH");
+    if(cdp != nullptr)
+        cuda_include_path = std::string("-I") + std::string(cdp) + "/include";
+    else {
+    	std::cout << "Warning: CUDA_PATH environment variable not set. Using default include path"
+    			"/usr/local/cuda/include" << std::endl;
+    	cuda_include_path = std::string("-I/usr/local/cuda/include");
+    }
+
+#ifdef __DEBUG
+  std::cout << "compiling cuda kernel " << name << std::endl;
+  std::cout << src << std::endl;
+  std::cout << "cwd: " << std::filesystem::current_path() << std::endl;
+  std::cout << "cuda_path: " << cuda_include_path << std::endl;
+#endif
+
+  SpoofOperator::AggType type = SpoofOperator::AggType::NONE;
+  SpoofOperator::AggOp op = SpoofOperator::AggOp::NONE;
+
+  auto pos = 0;
+  if((pos = src.find("CellType")) != std::string::npos) {
+      if(src.substr(pos, pos+30).find("FULL_AGG") != std::string::npos)
+          type = SpoofOperator::AggType::FULL_AGG;
+      else if(src.substr(pos, pos+30).find("ROW_AGG") != std::string::npos)
+          type = SpoofOperator::AggType::ROW_AGG;
+      else if(src.substr(pos, pos+30).find("COL_AGG") != std::string::npos)
+          type = SpoofOperator::AggType::COL_AGG;
+      else if(src.substr(pos, pos+30).find("NO_AGG") != std::string::npos)
+          type = SpoofOperator::AggType::NO_AGG;
+      else {
+          std::cerr << "error: unknown aggregation type" << std::endl;
+          return false;
+      }
+
+      if(type != SpoofOperator::AggType::NO_AGG) {
+          if((pos = src.find("AggOp")) != std::string::npos) {
+              if(src.substr(pos, pos+30).find("AggOp.SUM") != std::string::npos)
+                  op = SpoofOperator::AggOp::SUM;
+              else if(src.substr(pos, pos+30).find("AggOp.SUM_SQ") != std::string::npos)
+                  op = SpoofOperator::AggOp::SUM_SQ;
+              else if(src.substr(pos, pos+30).find("AggOp.MIN") != std::string::npos)
+                  op = SpoofOperator::AggOp::MIN;
+              else if(src.substr(pos, pos+30).find("AggOp.MAX") != std::string::npos)
+                  op = SpoofOperator::AggOp::MAX;
+              else {
+                std::cerr << "error: unknown aggregation operator" << std::endl;
+                return false;
+              }
+          }
+      }
+  }
+
+  std::stringstream s1, s2, s3;
+  s1 << "-I" << resource_path << "/cuda/headers";
+  s2 << "-I" << resource_path << "/cuda/spoof";
+
+  jitify::Program program = kernel_cache.program(src, 0, {s1.str(), s2.str(), cuda_include_path});
+  ops.insert(std::make_pair(name, SpoofOperator({std::move(program), type, op})));
+  return true;
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
new file mode 100644
index 0000000..36d29ec
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef SPOOFCUDACONTEXT_H
+#define SPOOFCUDACONTEXT_H
+
+#include <cmath>
+#include <cstdint>
+#include <map>
+#include <string>
+
+#ifdef __DEBUG
+    #define JITIFY_PRINT_ALL 1
+#endif
+
+#include <jitify.hpp>
+
+#include "host_utils.h"
+
+using jitify::reflection::type_of;
+
+// Descriptor of a compiled SPOOF operator: the jitify-compiled program plus
+// the aggregation semantics parsed from the generated source.
+struct SpoofOperator {
+  // Aggregation shape of the operator output: full scalar, per-row, per-column,
+  // or element-wise (NO_AGG). NONE marks "not determined".
+  enum class AggType : int { NO_AGG, ROW_AGG, COL_AGG, FULL_AGG, NONE };
+  // Aggregation function applied when agg_type != NO_AGG.
+  enum class AggOp : int {SUM, SUM_SQ, MIN, MAX, NONE };
+
+  jitify::Program program;
+  AggType agg_type;
+  AggOp agg_op;
+
+};
+
+// Native context for SPOOF CUDA code generation: owns the jitify kernel cache,
+// the map of compiled operators, and the pre-built reduction kernels used to
+// finish multi-block full aggregations.
+class SpoofCUDAContext {
+
+  jitify::JitCache kernel_cache;
+  std::map<const std::string, SpoofOperator> ops;
+  CUmodule reductions;
+  std::map<const std::string, CUfunction> reduction_kernels;
+
+public:
+  // ToDo: make launch config more adaptive
+  // num threads
+  const int NT = 256;
+
+  // values / thread
+  const int VT = 4;
+
+  // root directory of the CUDA headers/templates extracted from the jar
+  const std::string resource_path;
+
+  SpoofCUDAContext(const char* resource_path_) : reductions(nullptr), resource_path(resource_path_) {}
+
+  static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
+
+  static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
+
+  bool compile_cuda(const std::string &src, const std::string &name);
+
+  // Launches the previously compiled operator <name> (T is float or double).
+  // in_ptrs/side_ptrs/out_ptr/scalars_ptr carry device pointers handed over
+  // from the Java side; m x n is the shape of the main input, grix the global
+  // row index offset. Returns the scalar result for FULL_AGG operators and
+  // 0 otherwise (results of the other modes are written to out_ptr).
+  template <typename T>
+  T execute_kernel(const std::string &name, T **in_ptrs, int num_inputs,
+                   T **side_ptrs, int num_sides, T *out_ptr, T *scalars_ptr,
+                   int num_scalars, int m, int n, int grix) {
+
+    T result = 0.0;
+    size_t dev_buf_size;
+    T **d_sides = nullptr;
+    T *d_scalars = nullptr;
+    T *d_temp_agg_buf;
+    // NOTE(review): m * n is evaluated in 32-bit int before widening; assumes
+    // inputs stay below 2^31 elements -- TODO confirm upstream size bounds.
+    uint32_t N = m * n;
+
+    auto o = ops.find(name);
+    if (o != ops.end()) {
+      SpoofOperator *op = &(o->second);
+
+      // copy the host array of side-input device pointers to the device
+      if (num_sides > 0) {
+        dev_buf_size = sizeof(T *) * num_sides;
+        CHECK_CUDART(cudaMalloc((void **)&d_sides, dev_buf_size));
+        CHECK_CUDART(cudaMemcpy(d_sides, side_ptrs, dev_buf_size, cudaMemcpyHostToDevice));
+      }
+
+      // copy scalar inputs to the device
+      if (num_scalars > 0) {
+        dev_buf_size = sizeof(T) * num_scalars;
+        CHECK_CUDART(cudaMalloc((void **)&d_scalars, dev_buf_size));
+        CHECK_CUDART(cudaMemcpy(d_scalars, scalars_ptr, dev_buf_size, cudaMemcpyHostToDevice));
+      }
+
+      switch (op->agg_type) {
+          case SpoofOperator::AggType::FULL_AGG: {
+            // num ctas
+            int NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
+            dim3 grid(NB, 1, 1);
+            dim3 block(NT, 1, 1);
+            unsigned int shared_mem_size = NT * sizeof(T);
+
+            // per-block partial results, reduced further below if NB > 1
+            dev_buf_size = sizeof(T) * NB;
+            CHECK_CUDART(cudaMalloc((void **)&d_temp_agg_buf, dev_buf_size));
+#ifdef __DEBUG
+            // ToDo: connect output to SystemDS logging facilities
+            std::cout << "launching spoof cellwise kernel " << name << " with "
+                      << NT * NB << " threads in " << NB << " blocks and "
+                      << shared_mem_size
+                      << " bytes of shared memory for full aggregation of "
+                      << N << " elements"
+                      << std::endl;
+#endif
+            CHECK_CUDA(op->program.kernel(name)
+                .instantiate(type_of(result))
+                .configure(grid, block, shared_mem_size)
+                .launch(in_ptrs[0], d_sides, d_temp_agg_buf, d_scalars, m, n, grix));
+
+            // multi-pass tree reduction of the per-block partials down to one value
+            if(NB > 1) {
+                std::string reduction_kernel_name = determine_agg_kernel<T>(op);
+
+                // NOTE(review): find() result is dereferenced unchecked; assumes
+                // determine_agg_kernel only returns registered kernel names --
+                // TODO confirm or guard against end().
+                CUfunction reduce_kernel = reduction_kernels.find(reduction_kernel_name)->second;
+                N = NB;
+                int iter = 1;
+                while (NB > 1) {
+                    // in-place reduction: d_temp_agg_buf is both input and output
+                    void* args[3] = { &d_temp_agg_buf, &d_temp_agg_buf, &N};
+
+                    NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
+#ifdef __DEBUG
+                    std::cout << "agg iter " << iter++ << " launching spoof cellwise kernel " << name << " with "
+                    << NT * NB << " threads in " << NB << " blocks and "
+                    << shared_mem_size
+                    << " bytes of shared memory for full aggregation of "
+                    << N << " elements"
+                    << std::endl;
+#endif
+                    CHECK_CUDA(cuLaunchKernel(reduce_kernel, 
+                        NB, 1, 1, 
+                        NT, 1, 1,
+                        shared_mem_size, 0, args, 0));
+                    N = NB;
+                }
+            }
+
+            // fetch the final scalar result back to the host
+            CHECK_CUDART(cudaMemcpy(&result, d_temp_agg_buf, sizeof(T), cudaMemcpyDeviceToHost));
+            CHECK_CUDART(cudaFree(d_temp_agg_buf));
+            break;
+          }
+          case SpoofOperator::AggType::COL_AGG: {
+              // num ctas
+              int NB = std::ceil((N + NT - 1) / NT);
+              dim3 grid(NB, 1, 1);
+              dim3 block(NT, 1, 1);
+              unsigned int shared_mem_size = 0;
+#ifdef __DEBUG
+              std::cout << " launching spoof cellwise kernel " << name << " with "
+                  << NT * NB << " threads in " << NB << " blocks for column aggregation of "
+                  << N << " elements" << std::endl;
+#endif
+              CHECK_CUDA(op->program.kernel(name)
+                  .instantiate(type_of(result))
+                  .configure(grid, block)
+                  .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+
+              break;
+          }
+          case SpoofOperator::AggType::ROW_AGG: {
+              // num ctas: one block per row
+              int NB = m;
+              dim3 grid(NB, 1, 1);
+              dim3 block(NT, 1, 1);
+              unsigned int shared_mem_size = NT * sizeof(T);
+
+#ifdef __DEBUG
+              std::cout << " launching spoof cellwise kernel " << name << " with "
+                  << NT * NB << " threads in " << NB << " blocks and "
+                  << shared_mem_size << " bytes of shared memory for row aggregation of "
+                  << N << " elements" << std::endl;
+#endif
+              CHECK_CUDA(op->program.kernel(name)
+                  .instantiate(type_of(result))
+                  .configure(grid, block, shared_mem_size)
+                  .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+
+              break;
+          }
+          case SpoofOperator::AggType::NO_AGG: 
+          default: {
+            // num ctas
+              // ToDo: VT not a template parameter anymore
+            int NB = std::ceil((N + NT * VT - 1) / (NT * VT));
+            dim3 grid(NB, 1, 1);
+            dim3 block(NT, 1, 1);
+#ifdef __DEBUG
+            std::cout << "launching spoof cellwise kernel " << name << " with " << NT * NB
+                      << " threads in " << NB << " blocks without aggregation for " 
+                      << N << " elements"
+                      << std::endl;
+#endif
+            CHECK_CUDA(op->program.kernel(name)
+                .instantiate(type_of(result))
+                .configure(grid, block)
+                .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+          }
+      }
+
+      // free the temporary device copies of side pointers and scalars
+      if (num_scalars > 0)
+        CHECK_CUDART(cudaFree(d_scalars));
+
+      if (num_sides > 0)
+        CHECK_CUDART(cudaFree(d_sides));
+    } 
+    else {
+      std::cerr << "kernel " << name << " not found." << std::endl;
+      return result;
+    }
+    return result;
+  }
+
+  // Maps the operator's aggregation type/op to the name of a pre-compiled
+  // reduction kernel (e.g. "reduce_sum_d"); the _d/_f suffix selects the
+  // data type. Returns an empty string for unknown combinations.
+  template<typename T>
+  std::string determine_agg_kernel(SpoofOperator* op) {
+      std::string reduction_kernel_name;
+      std::string reduction_type;
+      std::string suffix = (typeid(T) == typeid(double) ? "_d" : "_f");
+      switch (op->agg_type) {
+      case SpoofOperator::AggType::FULL_AGG:
+          reduction_type = "_";
+          break;
+      case SpoofOperator::AggType::ROW_AGG:
+          reduction_type = "_row_";
+          break;
+      case SpoofOperator::AggType::COL_AGG:
+          reduction_type = "_col_";
+          break;
+      default:
+          std::cerr << "unknown reduction type" << std::endl;
+          return "";
+      }
+    
+      switch (op->agg_op) {
+      case SpoofOperator::AggOp::MIN:
+          reduction_kernel_name = "reduce" + reduction_type + "min" + suffix;
+          break;
+      case SpoofOperator::AggOp::MAX:
+          reduction_kernel_name = "reduce" + reduction_type + "max" + suffix;
+          break;
+      case SpoofOperator::AggOp::SUM_SQ:
+          reduction_kernel_name = "reduce" + reduction_type + "sum_sq" + suffix;
+          break;
+      case SpoofOperator::AggOp::SUM:
+          reduction_kernel_name = "reduce" + reduction_type + "sum" + suffix;
+          break;
+      default:
+          std::cerr << "unknown reduction op" << std::endl;
+          return "";
+      }
+
+      return reduction_kernel_name;
+  }
+};
+
+#endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/host_utils.h b/src/main/cuda/spoof-launcher/host_utils.h
new file mode 100644
index 0000000..47990ad
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/host_utils.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef HOST_UTILS_H
+#define HOST_UTILS_H
+
+// Make this header self-contained: the macros below use std::cout
+// (<iostream>) and the CUDA *runtime* API types cudaError_t /
+// cudaGetErrorString (<cuda_runtime.h>); <cuda.h> only provides the driver API.
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Checks a CUDA driver API call (CUresult) and logs the error name together
+// with file/line/function on failure; execution continues.
+#define CHECK_CUDA(call)                                                  \
+  do {                                                                    \
+    CUresult status = call;                                               \
+    if (status != CUDA_SUCCESS) {                                         \
+      const char* str;                                                    \
+      cuGetErrorName(status, &str);                                       \
+      std::cout << "(CUDA) returned " << str;                             \
+      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
+                << "())" << std::endl;                                    \
+    }                                                                     \
+  } while (0)
+
+// Checks a CUDA runtime API call (cudaError_t) and logs the error string
+// together with file/line/function on failure; execution continues.
+#define CHECK_CUDART(call)                                                \
+  do {                                                                    \
+    cudaError_t status = call;                                            \
+    if (status != cudaSuccess) {                                          \
+      std::cout << "(CUDART) returned " << cudaGetErrorString(status);    \
+      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
+                << "())" << std::endl;                                    \
+    }                                                                     \
+  } while (0)
+
+#endif // HOST_UTILS_H
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
new file mode 100644
index 0000000..6645003
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "jni_bridge.h"
+#include "SpoofCUDAContext.h"
+
+// JNI Methods to get/release arrays
+// NOTE(review): code between GET_ARRAY and RELEASE_ARRAY runs inside a JNI
+// critical region; the JNI spec forbids arbitrary JNI calls and long blocking
+// operations there -- keep these windows as short as possible.
+#define GET_ARRAY(env, input)                                                  \
+  ((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+
+#define RELEASE_ARRAY(env, java, cpp)                                                  \
+  (env->ReleasePrimitiveArrayCritical(java, cpp, 0))
+
+// Creates the native SpoofCUDAContext for the given device and returns its
+// address to the Java side as an opaque jlong handle.
+JNIEXPORT jlong JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
+    JNIEnv *env, jobject jobj, jint device_id, jstring resource_path) {
+  const char *path_chars = env->GetStringUTFChars(resource_path, nullptr);
+  const size_t handle = SpoofCUDAContext::initialize_cuda(device_id, path_chars);
+  env->ReleaseStringUTFChars(resource_path, path_chars);
+  return handle;
+}
+
+// Tears down the native context previously created by initialize_cuda_context.
+JNIEXPORT void JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
+    JNIEnv *env, jobject jobj, jlong ctx, jint device_id) {
+  auto *context = reinterpret_cast<SpoofCUDAContext *>(ctx);
+  SpoofCUDAContext::destroy_cuda(context, device_id);
+}
+
+// Hands the generated CUDA source over to the native context for JIT
+// compilation; returns true on success.
+JNIEXPORT jboolean JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
+    JNIEnv *env, jobject jobj, jlong ctx, jstring name, jstring src) {
+  auto *context = reinterpret_cast<SpoofCUDAContext *>(ctx);
+  const char *name_chars = env->GetStringUTFChars(name, nullptr);
+  const char *src_chars = env->GetStringUTFChars(src, nullptr);
+  const bool success = context->compile_cuda(src_chars, name_chars);
+  env->ReleaseStringUTFChars(src, src_chars);
+  env->ReleaseStringUTFChars(name, name_chars);
+  return success;
+}
+
+// Launches the compiled double-precision operator <name> and returns its
+// scalar result (0 for non-scalar outputs, which are written via out_ptr).
+// Array lengths are queried BEFORE entering the critical sections: the JNI
+// spec forbids calling arbitrary JNI functions (such as GetArrayLength)
+// between GetPrimitiveArrayCritical and ReleasePrimitiveArrayCritical.
+JNIEXPORT jdouble JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
+    JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs,
+    jlongArray side_ptrs, jlong out_ptr, jdoubleArray scalars_, jlong m, jlong n, jlong grix) {
+
+  SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+  const char *cstr_name = env->GetStringUTFChars(name, NULL);
+
+  int num_inputs = env->GetArrayLength(in_ptrs);
+  int num_sides = env->GetArrayLength(side_ptrs);
+  int num_scalars = env->GetArrayLength(scalars_);
+
+  double **inputs = reinterpret_cast<double **>(GET_ARRAY(env, in_ptrs));
+  double **sides = reinterpret_cast<double **>(GET_ARRAY(env, side_ptrs));
+  double *scalars = reinterpret_cast<double *>(GET_ARRAY(env, scalars_));
+
+  double result = ctx_->execute_kernel(
+      cstr_name, inputs, num_inputs, sides, num_sides,
+      reinterpret_cast<double*>(out_ptr), scalars, num_scalars, m, n, grix);
+
+  RELEASE_ARRAY(env, in_ptrs, inputs);
+  RELEASE_ARRAY(env, side_ptrs, sides);
+  RELEASE_ARRAY(env, scalars_, scalars);
+
+  // FIXME: that release causes an error
+  //std::cout << "releasing " << name_ << std::endl;
+  env->ReleaseStringUTFChars(name, cstr_name);
+  return result;
+}
+
+// Launches the compiled single-precision operator <name> and returns its
+// scalar result (0 for non-scalar outputs, which are written via out_ptr).
+// Array lengths are queried BEFORE entering the critical sections: the JNI
+// spec forbids calling arbitrary JNI functions (such as GetArrayLength)
+// between GetPrimitiveArrayCritical and ReleasePrimitiveArrayCritical.
+JNIEXPORT jfloat JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f(
+    JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs,
+    jlongArray side_ptrs, jlong out_ptr, jfloatArray scalars_, jlong m, jlong n, jlong grix) {
+
+  SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+
+  const char *cstr_name = env->GetStringUTFChars(name, NULL);
+
+  int num_inputs = env->GetArrayLength(in_ptrs);
+  int num_sides = env->GetArrayLength(side_ptrs);
+  int num_scalars = env->GetArrayLength(scalars_);
+
+  float **inputs = reinterpret_cast<float**>(GET_ARRAY(env, in_ptrs));
+  float **sides = reinterpret_cast<float **>(GET_ARRAY(env, side_ptrs));
+  float *scalars = reinterpret_cast<float *>(GET_ARRAY(env, scalars_));
+
+  float result = ctx_->execute_kernel(
+      cstr_name, inputs, num_inputs, sides, num_sides,
+      reinterpret_cast<float *>(out_ptr), scalars, num_scalars, m, n, grix);
+
+  RELEASE_ARRAY(env, in_ptrs, inputs);
+  RELEASE_ARRAY(env, side_ptrs, sides);
+  RELEASE_ARRAY(env, scalars_, scalars);
+
+  // FIXME: that release causes an error
+  env->ReleaseStringUTFChars(name, cstr_name);
+  return result;
+}
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h b/src/main/cuda/spoof-launcher/jni_bridge.h
new file mode 100644
index 0000000..a06bb1b
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+
+#pragma once
+#ifndef JNI_BRIDGE_H
+#define JNI_BRIDGE_H
+
+#include <jni.h>
+/* Header for class org_apache_sysds_hops_codegen_SpoofCompiler */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method:    initialize_cuda_context
+ * Signature: (ILjava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
+    JNIEnv *, jobject, jint, jstring);
+
+/*
+ * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method:    destroy_cuda_context
+ * Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
+    JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method:    compile_cuda_kernel
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
+    JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDA
+ * Method:    execute_d
+ * Signature: (JLjava/lang/String;[J[JJ[DJJJ)D
+ */
+JNIEXPORT jdouble JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
+    JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jdoubleArray, jlong, jlong, jlong);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDA
+ * Method:    execute_f
+ * Signature: (JLjava/lang/String;[J[JJ[FJJJ)F
+ */
+JNIEXPORT jfloat JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f(
+    JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jfloatArray, jlong, jlong, jlong);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JNI_BRIDGE_H
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
new file mode 100644
index 0000000..2f76802
--- /dev/null
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -0,0 +1,54 @@
+%TMP%
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// CellType: %TYPE%
+// AggOp: %AGG_OP_NAME%
+// SparseSafe: %SPARSE_SAFE%
+// SEQ: %SEQ%
+
+#include "agg_ops.cuh"
+#include "reduction.cuh"
+#include "spoof_utils.cuh"
+#include "utils.cuh"
+
+// Generated cell-wise operator functor: the code generator splices the
+// per-cell expression into operator() via the %BODY_dense% placeholder.
+template<typename T>
+struct SpoofCellwiseOp {
+   // b: side-input device pointers, scalars: scalar inputs,
+   // m x n: main input shape, grix_: global row index offset of this block
+   T**b; T* scalars; 
+   int m, n, grix_;
+
+   SpoofCellwiseOp(T** b, T* scalars, int m, int n, int grix) : 
+       b(b), scalars(scalars), m(m), n(n), grix_(grix) {}
+
+   // Evaluates the generated expression for cell value a at linear index idx
+   // (row/column indices derived assuming row-major layout).
+   __device__  __forceinline__ T operator()(T a, int idx) const {
+        int rix = idx / n;
+        int cix = idx % n;
+        int grix = grix_ + rix;
+%BODY_dense%
+        return %OUT%;
+   }
+};
+
+// Kernel entry point, named after the generated operator (%TMP%); dispatches
+// to the aggregation primitive selected by the %TYPE% placeholder.
+template<typename T>
+__global__ void %TMP% (T *a, T** b, T* c, T* scalars, int m, int n, int grix) {
+   %AGG_OP%<T> agg_op;
+   SpoofCellwiseOp<T> spoof_op(b, scalars, m, n, grix);
+   %TYPE%<T, %AGG_OP%<T>, SpoofCellwiseOp<T>>(a, c, m, n, %INITIAL_VALUE%, agg_op, spoof_op);
+};
diff --git a/src/main/cuda/spoof/functions.cuh b/src/main/cuda/spoof/functions.cuh
new file mode 100644
index 0000000..55f3ee3
--- /dev/null
+++ b/src/main/cuda/spoof/functions.cuh
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// Machine epsilons used for comparisons and integer rounding below.
+__constant__ double DOUBLE_EPS = 1.11022E-16; // 2 ^ -53
+// NOTE(review): declared double although it is the float epsilon; float
+// operands are promoted to double when compared against it -- confirm intended.
+__constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26
+__constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it  
+
+// Rounds toward negative with epsilon tolerance, re-applying the sign.
+// NOTE(review): the result is assigned to an unsigned type although the
+// computed value can be negative -- presumably mirroring the Java backend's
+// integer conversion; confirm against CodegenUtils.
+__device__ unsigned long long toUInt64(double a) {
+    return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS));
+}
+
+__device__ unsigned int toUInt32(float a) {
+    return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS));
+}
+
+// Element access for a column vector (single index).
+template<typename T>
+__device__ T getValue(T* data, int rowIndex) {
+    return data[rowIndex];
+}
+
+// Element access for a row-major dense matrix with n columns.
+template<typename T>
+__device__ T getValue(T* data, int n, int rowIndex, int colIndex) {
+    return data[rowIndex * n + colIndex];
+}
+
+// Integer division on floating point operands: result truncated to an
+// integer value; NaN/Inf are passed through unchanged.
+template<typename T>
+__device__ T intDiv(T a, T b);
+
+template<>
+__device__ double intDiv(double a, double b) {
+    double ret = a / b;
+    return (isnan(ret) || isinf(ret)) ? ret : toUInt64(ret);
+}
+
+template<>
+__device__ float intDiv(float a, float b) {
+    float ret = a / b;
+    return (isnan(ret) || isinf(ret)) ? ret : toUInt32(ret);
+}
+
+// Modulus consistent with intDiv; returns NaN for (near-)zero divisors.
+template<typename T>
+__device__ T modulus(T a, T b);
+
+template<>
+__device__ double modulus(double a, double b) {
+    if (fabs(b) < DOUBLE_EPS)
+        return CUDART_NAN;
+    return a - intDiv(a, b) * b;
+}
+
+template<>
+__device__ float modulus(float a, float b) {
+    if (fabs(b) < FLOAT_EPS)
+        return CUDART_NAN_F;
+    return a - intDiv(a, b) * b;
+}
+
+// Bitwise AND of the operands' bit patterns, returned as floating point.
+template<typename T>
+__device__ T bwAnd(T a, T b);
+
+// ToDo: does not work with long long
+// NOTE(review): 'long' is 32 bit on Windows (LLP64), so only half of the
+// double's bit pattern participates in the AND there; the cast also violates
+// strict aliasing -- revisit once the long long limitation above is resolved.
+template<>
+__device__ double bwAnd(double a, double b) {
+    return (*reinterpret_cast<long*>(&a)) & (*reinterpret_cast<long*>(&b));
+}
+
+template<>
+__device__ float bwAnd(float a, float b) {
+    return (*reinterpret_cast<int*>(&a)) & (*reinterpret_cast<int*>(&b));
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java b/src/main/java/org/apache/sysds/api/DMLScript.java
index 758fde6..d32731f 100644
--- a/src/main/java/org/apache/sysds/api/DMLScript.java
+++ b/src/main/java/org/apache/sysds/api/DMLScript.java
@@ -45,6 +45,7 @@ import org.apache.sysds.conf.CompilerConfig;
 import org.apache.sysds.conf.ConfigurationManager;
 import org.apache.sysds.conf.DMLConfig;
 import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.lops.Lop;
 import org.apache.sysds.parser.DMLProgram;
 import org.apache.sysds.parser.DMLTranslator;
@@ -97,7 +98,7 @@ public class DMLScript
 	public static boolean     LINEAGE_DEDUP = DMLOptions.defaultOptions.lineage_dedup;     // whether deduplicate lineage items
 	public static ReuseCacheType LINEAGE_REUSE = DMLOptions.defaultOptions.linReuseType;   // whether lineage-based reuse
 	public static LineageCachePolicy LINEAGE_POLICY = DMLOptions.defaultOptions.linCachePolicy; // lineage cache eviction policy
-	public static boolean     CHECK_PRIVACY = DMLOptions.defaultOptions.checkPrivacy;      // Check which privacy constraints are loaded and checked during federated execution 
+	public static boolean     CHECK_PRIVACY = DMLOptions.defaultOptions.checkPrivacy;      // Check which privacy constraints are loaded and checked during federated execution
 
 	public static boolean           USE_ACCELERATOR     = DMLOptions.defaultOptions.gpu;
 	public static boolean           FORCE_ACCELERATOR   = DMLOptions.defaultOptions.forceGPU;
@@ -179,7 +180,7 @@ public class DMLScript
 	 * @return true if success, false otherwise
 	 * @throws IOException If an internal IOException happens.
 	 */
-	public static boolean executeScript( Configuration conf, String[] args ) 
+	public static boolean executeScript( Configuration conf, String[] args )
 		throws  IOException, ParseException, DMLScriptException
 	{
 		//parse arguments and set execution properties
@@ -364,7 +365,10 @@ public class DMLScript
 		
 		//Step 1: parse configuration files & write any configuration specific global variables
 		loadConfiguration(fnameOptConfig);
-		
+
+		//Step 2: configure codegen
+		configureCodeGen();
+
 		//Step 3: parse dml script
 		Statistics.startCompileTimer();
 		ParserWrapper parser = ParserFactory.createParser();
@@ -416,7 +420,7 @@ public class DMLScript
 			cleanupHadoopExecution( ConfigurationManager.getDMLConfig());
 		}
 	}
-	
+
 	/**
 	 * Sets the global flags in DMLScript based on user provided configuration
 	 * 
@@ -493,8 +497,8 @@ public class DMLScript
 		
 		//0) cleanup federated workers if necessary
 		FederatedData.clearFederatedWorkers();
-		
-		//1) cleanup scratch space (everything for current uuid) 
+
+		//1) cleanup scratch space (everything for current uuid)
 		//(required otherwise export to hdfs would skip assumed unnecessary writes if same name)
 		HDFSTool.deleteFileIfExistOnHDFS( config.getTextValue(DMLConfig.SCRATCH_SPACE) + dirSuffix );
 		
@@ -560,7 +564,7 @@ public class DMLScript
 
 	/**
 	 * Print the error in a user friendly manner.
-	 * 
+	 *
 	 * @param e The exception thrown.
 	 */
 	public static void errorPrint(Exception e){
@@ -584,4 +588,19 @@ public class DMLScript
 		sb.append("\n" + ANSI_RESET);
 		System.out.println(sb.toString());
 	}
+
+	/**
+	 * Loads the native code generator library if codegen is enabled and a
+	 * non-Java generator API is configured via sysds.codegen.api.
+	 */
+	private static void configureCodeGen() {
+		// load native codegen if configured
+		if(ConfigurationManager.isCodegenEnabled()) {
+			try {
+				// valueOf inside the try: an invalid/missing sysds.codegen.api value
+				// is logged instead of aborting script startup with an
+				// IllegalArgumentException/NullPointerException
+				SpoofCompiler.GeneratorAPI configured_generator = SpoofCompiler.GeneratorAPI.valueOf(
+					ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.CODEGEN_API).toUpperCase());
+				if(configured_generator != SpoofCompiler.GeneratorAPI.JAVA)
+					SpoofCompiler.loadNativeCodeGenerator(configured_generator);
+			}
+			catch(Exception e) {
+				LOG.error("Failed to load native cuda codegen library\n" + e);
+			}
+		}
+	}
 }
diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
index fdc3602..516b956 100644
--- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
@@ -60,10 +60,10 @@ public class ConfigurationManager
 	
 	
 	/**
-	 * Returns a cached JobConf object, intended for global use by all operations 
+	 * Returns a cached JobConf object, intended for global use by all operations
 	 * with read-only access to job conf. This prevents to read the hadoop conf files
-	 * over and over again from classpath. However, 
-	 * 
+	 * over and over again from classpath. However,
+	 *
 	 * @return the cached JobConf
 	 */
 	public static JobConf getCachedJobConf() {
@@ -177,11 +177,7 @@ public class ConfigurationManager
 	
 	public static boolean isCodegenEnabled() {
 		return (getDMLConfig().getBooleanValue(DMLConfig.CODEGEN)
-			|| getCompilerConfigFlag(ConfigType.CODEGEN_ENABLED))
-			&& !DMLScript.USE_ACCELERATOR;
-		//note: until codegen is supported for the GPU backend, we globally
-		//disable codegen if operations are forced to the GPU to avoid
-		//a counter-productive impact on performance.
+			|| getCompilerConfigFlag(ConfigType.CODEGEN_ENABLED));
 	}
 	
 	///////////////////////////////////////
diff --git a/src/main/java/org/apache/sysds/conf/DMLConfig.java b/src/main/java/org/apache/sysds/conf/DMLConfig.java
index 7bba416..9c1b65a 100644
--- a/src/main/java/org/apache/sysds/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysds/conf/DMLConfig.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.sysds.hops.OptimizerUtils;
 import org.apache.sysds.hops.codegen.SpoofCompiler.CompilerType;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.hops.codegen.SpoofCompiler.PlanSelector;
 import org.apache.sysds.lops.Compression;
 import org.apache.sysds.parser.ParseException;
@@ -72,6 +73,7 @@ public class DMLConfig
 	public static final String NATIVE_BLAS          = "sysds.native.blas";
 	public static final String NATIVE_BLAS_DIR      = "sysds.native.blas.directory";
 	public static final String CODEGEN              = "sysds.codegen.enabled"; //boolean
+	public static final String CODEGEN_API          = "sysds.codegen.api"; // see SpoofCompiler.API
 	public static final String CODEGEN_COMPILER     = "sysds.codegen.compiler"; //see SpoofCompiler.CompilerType
 	public static final String CODEGEN_OPTIMIZER    = "sysds.codegen.optimizer"; //see SpoofCompiler.PlanSelector
 	public static final String CODEGEN_PLANCACHE    = "sysds.codegen.plancache"; //boolean
@@ -118,6 +120,7 @@ public class DMLConfig
 		_defaultVals.put(COMPRESSED_LOSSY,       "false" );
 		_defaultVals.put(COMPRESSED_VALID_COMPRESSIONS, "DDC,OLE,RLE");
 		_defaultVals.put(CODEGEN,                "false" );
+		_defaultVals.put(CODEGEN_API,		     GeneratorAPI.JAVA.name() );
 		_defaultVals.put(CODEGEN_COMPILER,       CompilerType.AUTO.name() );
 		_defaultVals.put(CODEGEN_OPTIMIZER,      PlanSelector.FUSE_COST_BASED_V2.name() );
 		_defaultVals.put(CODEGEN_PLANCACHE,      "true" );
@@ -379,7 +382,7 @@ public class DMLConfig
 			LOCAL_TMP_DIR,SCRATCH_SPACE,OPTIMIZATION_LEVEL, DEFAULT_BLOCK_SIZE,
 			CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
 			COMPRESSED_LINALG, COMPRESSED_LOSSY, COMPRESSED_VALID_COMPRESSIONS,
-			CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
+			CODEGEN, CODEGEN_API, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
 			STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO,
 			AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, 
 			LOCAL_SPARK_NUM_THREADS, EVICTION_SHADOW_BUFFERSIZE, GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR
diff --git a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
index 2d50056..d388583 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
@@ -19,15 +19,16 @@
 
 package org.apache.sysds.hops.codegen;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
 import java.util.Map.Entry;
+import java.util.jar.JarEntry;
+import java.util.jar.JarFile;
 
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.SystemUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
@@ -82,6 +83,7 @@ import org.apache.sysds.parser.WhileStatementBlock;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.codegen.CodegenUtils;
+import org.apache.sysds.runtime.codegen.SpoofCUDA;
 import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;
 import org.apache.sysds.runtime.codegen.SpoofRowwise.RowType;
 import org.apache.sysds.runtime.controlprogram.BasicProgramBlock;
@@ -93,36 +95,48 @@ import org.apache.sysds.runtime.controlprogram.Program;
 import org.apache.sysds.runtime.controlprogram.ProgramBlock;
 import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
 import org.apache.sysds.runtime.instructions.Instruction;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.runtime.matrix.data.Pair;
 import org.apache.sysds.utils.Explain;
+import org.apache.sysds.utils.NativeHelper;
 import org.apache.sysds.utils.Statistics;
 
-public class SpoofCompiler
-{
+public class SpoofCompiler {
 	private static final Log LOG = LogFactory.getLog(SpoofCompiler.class.getName());
-	
+
 	//internal configuration flags
-	public static CompilerType JAVA_COMPILER           = CompilerType.JANINO; 
-	public static PlanSelector PLAN_SEL_POLICY         = PlanSelector.FUSE_COST_BASED_V2; 
+	public static CompilerType JAVA_COMPILER           = CompilerType.JANINO;
+	public static PlanSelector PLAN_SEL_POLICY         = PlanSelector.FUSE_COST_BASED_V2;
 	public static final IntegrationType INTEGRATION    = IntegrationType.RUNTIME;
 	public static final boolean RECOMPILE_CODEGEN      = true;
 	public static final boolean PRUNE_REDUNDANT_PLANS  = true;
 	public static PlanCachePolicy PLAN_CACHE_POLICY    = PlanCachePolicy.CSLH;
 	public static final int PLAN_CACHE_SIZE            = 1024; //max 1K classes
 	public static final RegisterAlloc REG_ALLOC_POLICY = RegisterAlloc.EXACT_STATIC_BUFF;
-	
+	public static GeneratorAPI API = GeneratorAPI.JAVA;
+	public static HashMap<GeneratorAPI, Long> native_contexts;
+
 	public enum CompilerType {
 		AUTO,
 		JAVAC,
 		JANINO,
+		NVCC,
+		NVRTC
 	}
-	
+
+
+	public enum GeneratorAPI {
+		AUTO,
+		JAVA,
+		CUDA
+	}
+
 	public enum IntegrationType {
 		HOPS,
 		RUNTIME,
 	}
-	
+
 	public enum PlanSelector {
 		FUSE_ALL,             //maximal fusion, possible w/ redundant compute
 		FUSE_NO_REDUNDANCY,   //fusion without redundant compute 
@@ -143,18 +157,143 @@ public class SpoofCompiler
 		CONSTANT, //plan cache, with always compile literals
 		CSLH,     //plan cache, with context-sensitive literal replacement heuristic
 		NONE;     //no plan cache
-		
+
 		public static PlanCachePolicy get(boolean planCache, boolean compileLiterals) {
 			return !planCache ? NONE : compileLiterals ? CONSTANT : CSLH;
 		}
 	}
-	
+
 	public enum RegisterAlloc {
 		HEURISTIC,           //max vector intermediates, special handling pipelines (always safe)
 		EXACT_DYNAMIC_BUFF,  //min number of live vector intermediates, assuming dynamic pooling
 		EXACT_STATIC_BUFF,   //min number of live vector intermediates, assuming static array ring buffer
 	}
-	
+
+	@Override
+	protected void finalize() {
+			SpoofCompiler.cleanupCodeGenerator();
+	}
+
+	public static void loadNativeCodeGenerator(GeneratorAPI generator) {
+		if(DMLScript.getGlobalExecMode() == ExecMode.SPARK) {
+			LOG.warn("Not loading native codegen library in SPARK execution mode!\n");
+			return;
+		}
+
+		// loading cuda codegen (the only supported API atm)
+		if(generator == GeneratorAPI.AUTO && DMLScript.USE_ACCELERATOR)
+			generator = GeneratorAPI.CUDA;
+
+		if(generator == GeneratorAPI.CUDA && !DMLScript.USE_ACCELERATOR)
+			generator = GeneratorAPI.JAVA;
+
+		if(native_contexts == null)
+			native_contexts = new HashMap<>();
+
+		if(!native_contexts.containsKey(generator)) {
+			if(generator == GeneratorAPI.CUDA) {
+				// init GPUs with jCuda to avoid double initialization problems
+				GPUContextPool.initializeGPU();
+
+				String arch = SystemUtils.OS_ARCH;
+				String os = SystemUtils.OS_NAME;
+				String suffix = ".so";
+
+				if(SystemUtils.IS_OS_LINUX && SystemUtils.OS_ARCH.equalsIgnoreCase("amd64"))
+					arch = "x86_64";
+				if(SystemUtils.IS_OS_WINDOWS) {
+					os = "Windows";
+					suffix = ".dll";
+				}
+
+				String libName = "libsystemds_spoof_cuda-" + os + "-" + arch + suffix;
+
+				// ToDo: remove legacy paths
+				boolean isLoaded = NativeHelper.loadBLAS(System.getProperty("user.dir")
+					+ "/src/main/cpp/lib".replace("/",File.separator), libName, "");
+
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadBLAS(System.getProperty("user.dir")
+						+ "/target/classes/lib".replace("/", File.separator), libName, "");
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadBLAS(null, libName, "");
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadLibraryHelperFromResource(libName);
+
+				if(isLoaded) {
+					String local_tmp = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.LOCAL_TMP_DIR);
+					String jar_path = SpoofCompiler.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+					if(jar_path.contains(".jar")) {
+						try {
+							extractCodegenSources(local_tmp, jar_path);
+						}
+						catch (IOException e){
+							LOG.error("Could not extract spoof files from jar: " + e);
+							API = GeneratorAPI.JAVA;
+							return;
+						}
+					}
+					else {
+						local_tmp = System.getProperty("user.dir") + "/src/main".replace("/", File.separator);
+					}
+
+					long ctx_ptr = initialize_cuda_context(0, local_tmp);
+					if(ctx_ptr != 0) {
+						native_contexts.put(GeneratorAPI.CUDA, ctx_ptr);
+						API = GeneratorAPI.CUDA;
+						LOG.info("Successfully loaded spoof cuda library");
+					}
+					else {
+						API = GeneratorAPI.JAVA;
+						LOG.error("Failed to initialize spoof cuda context. Falling back to java codegen\n");
+					}
+				}
+				else {
+					API = GeneratorAPI.JAVA;
+					LOG.error("Loading of spoof native cuda failed. Falling back to java codegen\n");
+				}
+			}
+		}
+	}
+
+	public static void unloadNativeCodeGenerator() {
+		if(native_contexts.containsKey(GeneratorAPI.CUDA)) {
+			destroy_cuda_context(native_contexts.get(GeneratorAPI.CUDA), 0);
+			native_contexts.remove(GeneratorAPI.CUDA);
+			if(API == GeneratorAPI.CUDA)
+				API = GeneratorAPI.JAVA;
+		}
+	}
+
+	private static void extractCodegenSources(String resource_path, String jar_path) throws IOException {
+		JarFile jar_file = new JarFile(jar_path);
+		Enumeration<JarEntry> files_in_jar = jar_file.entries();
+
+		while (files_in_jar.hasMoreElements()) {
+			JarEntry in_file = files_in_jar.nextElement();
+			if (in_file.getName().startsWith("cuda/") && !in_file.isDirectory()) {
+				File out_file = new File(resource_path, in_file.getName());
+				out_file.deleteOnExit();
+				File parent = out_file.getParentFile();
+				if (parent != null) {
+					parent.mkdirs();
+					parent.deleteOnExit();
+				}
+				IOUtils.copy(jar_file.getInputStream(in_file), FileUtils.openOutputStream(out_file));
+			}
+		}
+	}
+
+	private static boolean compile_cuda(String name, String src) {
+		return compile_cuda_kernel(native_contexts.get(GeneratorAPI.CUDA), name, src);
+	}
+
+	private static native long initialize_cuda_context(int device_id, String resource_path);
+
+	private static native boolean compile_cuda_kernel(long ctx, String name, String src);
+
+	private static native void destroy_cuda_context(long ctx, int device_id);
+
 	//plan cache for cplan->compiled source to avoid unnecessary codegen/source code compile
 	//for equal operators from (1) different hop dags and (2) repeated recompilation 
 	//note: if PLAN_CACHE_SIZE is exceeded, we evict the least-recently-used plan (LRU policy)
@@ -370,9 +509,23 @@ public class SpoofCompiler
 				Class<?> cla = planCache.getPlan(tmp.getValue());
 				
 				if( cla == null ) {
-					//generate java source code
-					String src = tmp.getValue().codegen(false);
-					
+					String src = "";
+					boolean native_compiled_successfully = false;
+
+					if(API == GeneratorAPI.CUDA && tmp.getValue().isSupported(API)) {
+						src = tmp.getValue().codegen(false, GeneratorAPI.CUDA);
+						native_compiled_successfully = compile_cuda(tmp.getValue().getVarname(), src);
+						if (native_compiled_successfully)
+							CodegenUtils.putNativeOpData(new SpoofCUDA(tmp.getValue()));
+						else
+							LOG.warn("CUDA compilation failed, falling back to JAVA");
+					}
+
+					if(API == GeneratorAPI.JAVA || !native_compiled_successfully) {
+							src = tmp.getValue().codegen(false, GeneratorAPI.JAVA);
+							cla = CodegenUtils.compileClass("codegen."+ tmp.getValue().getClassname(), src);
+					}
+
 					//explain debug output cplans or generated source code
 					if( LOG.isTraceEnabled() || DMLScript.EXPLAIN.isHopsType(recompile) ) {
 						LOG.info("Codegen EXPLAIN (generated cplan for HopID: " + cplan.getKey() + 
@@ -385,11 +538,7 @@ public class SpoofCompiler
 							", line "+tmp.getValue().getBeginLine() + ", hash="+tmp.getValue().hashCode()+"):");
 						LOG.info(src);
 					}
-					
-					//compile generated java source code
-					cla = CodegenUtils.compileClass("codegen."+
-						tmp.getValue().getClassname(), src);
-					
+
 					//maintain plan cache
 					if( PLAN_CACHE_POLICY!=PlanCachePolicy.NONE )
 						planCache.putPlan(tmp.getValue(), cla);
@@ -399,7 +548,7 @@ public class SpoofCompiler
 				}
 				
 				//make class available and maintain hits
-				if(cla != null)
+				if(cla != null || API != GeneratorAPI.JAVA)
 					clas.put(cplan.getKey(), new Pair<Hop[],Class<?>>(tmp.getKey(),cla));
 				if( DMLScript.STATISTICS )
 					Statistics.incrementCodegenOpCacheTotal();
@@ -442,6 +591,10 @@ public class SpoofCompiler
 			CodegenUtils.clearClassCache(); //class cache
 			planCache.clear(); //plan cache
 		}
+
+		if(API != GeneratorAPI.JAVA)
+			unloadNativeCodeGenerator();
+
 	}
 	
 	/**
@@ -594,7 +747,7 @@ public class SpoofCompiler
 			CNodeTpl tmpCNode = cplans.get(hop.getHopID()).getValue();
 			
 			hnew = new SpoofFusedOp(hop.getName(), hop.getDataType(), hop.getValueType(),
-				tmpCla.getValue(), false, tmpCNode.getOutputDimType());
+				tmpCla.getValue(), tmpCNode.getGeneratorAPI(), tmpCNode.getVarname(), false, tmpCNode.getOutputDimType());
 			Hop[] inHops = tmpCla.getKey();
 			
 
diff --git a/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
index 3aca219..598d956 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
@@ -21,6 +21,7 @@ package org.apache.sysds.hops.codegen;
 
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.hops.Hop;
 import org.apache.sysds.hops.MemoTable;
 import org.apache.sysds.hops.MultiThreadedHop;
@@ -33,6 +34,7 @@ import org.apache.sysds.runtime.meta.DataCharacteristics;
 import org.apache.sysds.runtime.meta.MatrixCharacteristics;
 
 import java.util.ArrayList;
+import java.util.Objects;
 
 public class SpoofFusedOp extends MultiThreadedHop
 {
@@ -55,16 +57,21 @@ public class SpoofFusedOp extends MultiThreadedHop
 	private boolean _distSupported = false;
 	private long _constDim2 = -1;
 	private SpoofOutputDimsType _dimsType;
-	
+	private GeneratorAPI _api = GeneratorAPI.JAVA;
+	private String _genVarName;
+
 	public SpoofFusedOp ( ) {
 	
 	}
 	
-	public SpoofFusedOp( String name, DataType dt, ValueType vt, Class<?> cla, boolean dist, SpoofOutputDimsType type ) {
+	public SpoofFusedOp( String name, DataType dt, ValueType vt, Class<?> cla, GeneratorAPI api, String genVarName,
+						 boolean dist, SpoofOutputDimsType type ) {
 		super(name, dt, vt);
 		_class = cla;
 		_distSupported = dist;
 		_dimsType = type;
+		_api = api;
+		_genVarName = genVarName;
 	}
 
 	@Override
@@ -81,7 +88,10 @@ public class SpoofFusedOp extends MultiThreadedHop
 	
 	@Override
 	public boolean isGPUEnabled() {
-		return false;
+		if(_api == GeneratorAPI.CUDA)
+			return true;
+		else
+			return false;
 	}
 	
 	@Override
@@ -91,10 +101,13 @@ public class SpoofFusedOp extends MultiThreadedHop
 
 	@Override
 	protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) {
-		return _class.getGenericSuperclass().equals(SpoofRowwise.class) ?
-			OptimizerUtils.estimateSize(dim1, dim2) :
-			OptimizerUtils.estimatePartitionedSizeExactSparsity(
-				dim1, dim2, getBlocksize(), nnz);
+		if(_api == GeneratorAPI.JAVA) {
+			return _class.getGenericSuperclass().equals(SpoofRowwise.class) ?
+					OptimizerUtils.estimateSize(dim1, dim2) :
+					OptimizerUtils.estimatePartitionedSizeExactSparsity(dim1, dim2, getBlocksize(), nnz);
+		}
+		else
+			return OptimizerUtils.estimatePartitionedSizeExactSparsity(dim1, dim2, getBlocksize(), nnz);
 	}
 
 	@Override
@@ -114,7 +127,7 @@ public class SpoofFusedOp extends MultiThreadedHop
 			inputs.add(c.constructLops());
 		
 		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-		SpoofFused lop = new SpoofFused(inputs, getDataType(), getValueType(), _class, k, et);
+		SpoofFused lop = new SpoofFused(inputs, getDataType(), getValueType(), _class, _api, _genVarName, k, et);
 		setOutputDimensions(lop);
 		setLineNumbers(lop);
 		setLops(lop);
@@ -140,12 +153,16 @@ public class SpoofFusedOp extends MultiThreadedHop
 
 	@Override
 	public String getOpString() {
-		return "spoof("+_class.getSimpleName()+")";
-	}
+		if(_class != null)
+			return "spoof("+_class.getSimpleName()+")";
+		else
+			return "spoof(" + getName() + ")";	}
 	
 	public String getClassName() {
-		return _class.getName();
-	}
+		if(_class != null)
+			return _class.getName();
+		else
+			return "spoof" + getName();	}
 	
 	@Override
 	protected DataCharacteristics inferOutputCharacteristics( MemoTable memo )
@@ -297,11 +314,12 @@ public class SpoofFusedOp extends MultiThreadedHop
 		
 		SpoofFusedOp that2 = (SpoofFusedOp)that;
 		//note: class implies dims type as well
-		boolean ret = ( _class.equals(that2._class)
+		boolean ret = (Objects.equals(_class, that2._class)
 				&& _distSupported == that2._distSupported
 				&& _maxNumThreads == that2._maxNumThreads
 				&& _constDim2 == that2._constDim2
-				&& getInput().size() == that2.getInput().size());
+				&& getInput().size() == that2.getInput().size()
+				&& _api == that2._api);
 		
 		if( ret ) {
 			for( int i=0; i<getInput().size(); i++ )
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
index 38a7bc3..a2f918e 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
@@ -25,6 +25,9 @@ import org.apache.sysds.hops.codegen.template.TemplateUtils;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
+
+import static org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI.CUDA;
 
 public abstract class CNode
 {
@@ -79,7 +82,10 @@ public abstract class CNode
 	public String getVarname() {
 		return _genVar;
 	}
-	
+
+	public String getVarname(GeneratorAPI api) { return getVarname(); }
+
+
 	public String getVectorLength() {
 		if( getVarname().startsWith("a") )
 			return "len";
@@ -161,7 +167,7 @@ public abstract class CNode
 		setVisited(false);
 	}
 	
-	public abstract String codegen(boolean sparse);
+	public abstract String codegen(boolean sparse, GeneratorAPI api);
 	
 	public abstract void setOutputDims();
 	
@@ -228,4 +234,36 @@ public abstract class CNode
 		
 		return tmp;
 	}
+
+	protected CodeTemplate getLanguageTemplateClass(CNode caller, GeneratorAPI api) {
+		switch (api) {
+			case CUDA:
+				if(caller instanceof CNodeCell)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.CellWise();
+				else if (caller instanceof CNodeUnary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Unary();
+				else if (caller instanceof CNodeBinary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Binary();
+				else if (caller instanceof CNodeTernary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Ternary();
+				else
+					return null;
+			case JAVA:
+				if(caller instanceof CNodeCell)
+					return new org.apache.sysds.hops.codegen.cplan.java.CellWise();
+				else if (caller instanceof CNodeUnary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Unary();
+				else if (caller instanceof CNodeBinary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Binary();
+				else if (caller instanceof CNodeTernary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Ternary();
+
+				else
+					return null;
+			default:
+				throw new RuntimeException("API not supported by code generator: " + api.toString());
+		}
+	}
+
+	public abstract boolean isSupported(GeneratorAPI api);
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
index eed8389..15a26bc 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
@@ -25,10 +25,10 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.sysds.hops.codegen.template.TemplateUtils;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
+public class CNodeBinary extends CNode {
 
-public class CNodeBinary extends CNode
-{
 	public enum BinType {
 		//matrix multiplication operations
 		DOT_PRODUCT, VECT_MATRIXMULT, VECT_OUTERMULT_ADD,
@@ -76,154 +76,6 @@ public class CNodeBinary extends CNode
 			return ssComm || vsComm || vvComm;
 		}
 		
-		public String getTemplate(boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
-			switch (this) {
-				case DOT_PRODUCT:
-					return sparseLhs ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
-									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-				case VECT_MATRIXMULT:
-					return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
-									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-				case VECT_OUTERMULT_ADD:
-					return  sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
-							sparseRhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
-									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
-				
-				//vector-scalar-add operations
-				case VECT_MULT_ADD:
-				case VECT_DIV_ADD:
-				case VECT_MINUS_ADD:
-				case VECT_PLUS_ADD:
-				case VECT_POW_ADD:
-				case VECT_XOR_ADD:
-				case VECT_MIN_ADD:
-				case VECT_MAX_ADD:	
-				case VECT_EQUAL_ADD:
-				case VECT_NOTEQUAL_ADD:
-				case VECT_LESS_ADD:
-				case VECT_LESSEQUAL_ADD:
-				case VECT_GREATER_ADD:
-				case VECT_GREATEREQUAL_ADD:
-				case VECT_CBIND_ADD: {
-					String vectName = getVectorPrimitiveName();
-					if( scalarVector )
-						return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : 
-										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
-					else	
-						return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : 
-										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
-				}
-				
-				//vector-scalar operations
-				case VECT_MULT_SCALAR:
-				case VECT_DIV_SCALAR:
-				case VECT_MINUS_SCALAR:
-				case VECT_PLUS_SCALAR:
-				case VECT_POW_SCALAR:
-				case VECT_XOR_SCALAR:
-				case VECT_BITWAND_SCALAR:
-				case VECT_MIN_SCALAR:
-				case VECT_MAX_SCALAR:
-				case VECT_EQUAL_SCALAR:
-				case VECT_NOTEQUAL_SCALAR:
-				case VECT_LESS_SCALAR:
-				case VECT_LESSEQUAL_SCALAR:
-				case VECT_GREATER_SCALAR:
-				case VECT_GREATEREQUAL_SCALAR: {
-					String vectName = getVectorPrimitiveName();
-					if( scalarVector )
-						return sparseRhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
-										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
-					else	
-						return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
-										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
-				}
-				
-				case VECT_CBIND:
-					if( scalarInput )
-						return  "    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
-					else
-						return sparseLhs ? 
-								"    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
-								"    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
-				
-				//vector-vector operations
-				case VECT_MULT:
-				case VECT_DIV:
-				case VECT_MINUS:
-				case VECT_PLUS:
-				case VECT_XOR:
-				case VECT_BITWAND:
-				case VECT_BIASADD:
-				case VECT_BIASMULT:
-				case VECT_MIN:
-				case VECT_MAX:
-				case VECT_EQUAL:
-				case VECT_NOTEQUAL:
-				case VECT_LESS:
-				case VECT_LESSEQUAL:
-				case VECT_GREATER:
-				case VECT_GREATEREQUAL: {
-					String vectName = getVectorPrimitiveName();
-					return sparseLhs ? 
-						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : 
-						   sparseRhs ?
-						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
-						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-				}
-				
-				//scalar-scalar operations
-				case MULT:
-					return "    double %TMP% = %IN1% * %IN2%;\n";
-				
-				case DIV:
-					return "    double %TMP% = %IN1% / %IN2%;\n";
-				case PLUS:
-					return "    double %TMP% = %IN1% + %IN2%;\n";
-				case MINUS:
-					return "    double %TMP% = %IN1% - %IN2%;\n";
-				case MODULUS:
-					return "    double %TMP% = LibSpoofPrimitives.mod(%IN1%, %IN2%);\n";
-				case INTDIV: 
-					return "    double %TMP% = LibSpoofPrimitives.intDiv(%IN1%, %IN2%);\n";
-				case LESS:
-					return "    double %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
-				case LESSEQUAL:
-					return "    double %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
-				case GREATER:
-					return "    double %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
-				case GREATEREQUAL: 
-					return "    double %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
-				case EQUAL:
-					return "    double %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
-				case NOTEQUAL: 
-					return "    double %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
-				
-				case MIN:
-					return "    double %TMP% = Math.min(%IN1%, %IN2%);\n";
-				case MAX:
-					return "    double %TMP% = Math.max(%IN1%, %IN2%);\n";
-				case LOG:
-					return "    double %TMP% = Math.log(%IN1%)/Math.log(%IN2%);\n";
-				case LOG_NZ:
-					return "    double %TMP% = (%IN1% == 0) ? 0 : Math.log(%IN1%)/Math.log(%IN2%);\n";	
-				case POW:
-					return "    double %TMP% = Math.pow(%IN1%, %IN2%);\n";
-				case MINUS1_MULT:
-					return "    double %TMP% = 1 - %IN1% * %IN2%;\n";
-				case MINUS_NZ:
-					return "    double %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
-				case XOR:
-					return "    double %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1 : 0;\n";
-				case BITWAND:
-					return "    double %TMP% = LibSpoofPrimitives.bwAnd(%IN1%, %IN2%);\n";
-				case SEQ_RIX:
-					return "    double %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
-				
-				default: 
-					throw new RuntimeException("Invalid binary type: "+this.toString());
-			}
-		}
 		public boolean isVectorPrimitive() {
 			return isVectorScalarPrimitive() 
 				|| isVectorVectorPrimitive()
@@ -286,15 +138,15 @@ public class CNodeBinary extends CNode
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		if( isGenerated() )
 			return "";
 		
 		StringBuilder sb = new StringBuilder();
 		
 		//generate children
-		sb.append(_inputs.get(0).codegen(sparse));
-		sb.append(_inputs.get(1).codegen(sparse));
+		sb.append(_inputs.get(0).codegen(sparse, api));
+		sb.append(_inputs.get(1).codegen(sparse, api));
 		
 		//generate binary operation (use sparse template, if data input)
 		boolean lsparseLhs = sparse && _inputs.get(0) instanceof CNodeData 
@@ -305,12 +157,14 @@ public class CNodeBinary extends CNode
 		boolean scalarVector = (_inputs.get(0).getDataType().isScalar()
 			&& _inputs.get(1).getDataType().isMatrix());
 		String var = createVarname();
-		String tmp = _type.getTemplate(lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+//		String tmp = _type.getTemplate(api, lang, lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+		String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace input references and start indexes
 		for( int j=0; j<2; j++ ) {
-			String varj = _inputs.get(j).getVarname();
+			String varj = _inputs.get(j).getVarname(api);
 			
 			//replace sparse and dense inputs
 			tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals");
@@ -560,4 +414,15 @@ public class CNodeBinary extends CNode
 		return super.equals(that)
 			&& _type == that._type;
 	}
+
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index a894bae..3ea3d3b 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -22,31 +22,14 @@ package org.apache.sysds.hops.codegen.cplan;
 import java.util.ArrayList;
 
 import org.apache.sysds.common.Types.AggOp;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
 import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;
 import org.apache.sysds.runtime.util.UtilFunctions;
 
 public class CNodeCell extends CNodeTpl 
-{	
-	private static final String TEMPLATE = 
-			  "package codegen;\n"
-			+ "import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;\n"
-			+ "import org.apache.sysds.runtime.codegen.SpoofCellwise;\n"
-			+ "import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;\n"
-			+ "import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;\n"
-			+ "import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;\n"
-			+ "import org.apache.commons.math3.util.FastMath;\n"
-			+ "\n"
-			+ "public final class %TMP% extends SpoofCellwise {\n" 
-			+ "  public %TMP%() {\n"
-			+ "    super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP%);\n"
-			+ "  }\n"
-			+ "  protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) { \n"
-			+ "%BODY_dense%"
-			+ "    return %OUT%;\n"
-			+ "  }\n"
-			+ "}\n";
-	
+{
 	private CellType _type = null;
 	private AggOp _aggOp = null;
 	private boolean _sparseSafe = false;
@@ -83,7 +66,25 @@ public class CNodeCell extends CNodeTpl
 	public AggOp getAggOp() {
 		return _aggOp;
 	}
-	
+
+	public SpoofCellwise.AggOp getSpoofAggOp() {
+		if(_aggOp != null)
+			switch(_aggOp) {
+				case SUM:
+					return SpoofCellwise.AggOp.SUM;
+				case SUM_SQ:
+					return SpoofCellwise.AggOp.SUM_SQ;
+				case MIN:
+					return SpoofCellwise.AggOp.MIN;
+				case MAX:
+					return SpoofCellwise.AggOp.MAX;
+				default:
+					throw new RuntimeException("Unsupported cell type: "+_type.toString());
+		}
+		else
+			return null;
+	}
+
 	public void setSparseSafe(boolean flag) {
 		_sparseSafe = flag;
 	}
@@ -114,34 +115,63 @@ public class CNodeCell extends CNodeTpl
 		rRenameDataNode(_output, _inputs.get(0), "a");
 		renameInputs(_inputs, 1);
 	}
-	
-	@Override
-	public String codegen(boolean sparse) {
-		String tmp = TEMPLATE;
-		
+
+	public String codegen(boolean sparse, GeneratorAPI _api) {
+		api = _api;
+
+		String tmp = getLanguageTemplateClass(this, api).getTemplate(_type);
+
 		//generate dense/sparse bodies
-		String tmpDense = _output.codegen(false);
+		String tmpDense = _output.codegen(false, api);
 		_output.resetGenerated();
 
 		tmp = tmp.replace("%TMP%", createVarname());
 		tmp = tmp.replace("%BODY_dense%", tmpDense);
 		
 		//return last TMP
-		tmp = tmp.replace("%OUT%", _output.getVarname());
-		
+		tmp = tmp.replaceAll("%OUT%", _output.getVarname());
+
 		//replace meta data information
-		tmp = tmp.replace("%TYPE%", getCellType().name());
-		tmp = tmp.replace("%AGG_OP%", (_aggOp!=null) ? "AggOp."+_aggOp.name() : "null" );
+		tmp = tmp.replaceAll("%TYPE%", getCellType().name());
+		tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." + _aggOp.name() : "null");
 		tmp = tmp.replace("%SPARSE_SAFE%", String.valueOf(isSparseSafe()));
 		tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq()));
-		
+
+		if(api == GeneratorAPI.CUDA) {
+			// ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator
+			String agg_op = "IdentityOp";
+			String initial_value = "(T)4.0";
+			if(_aggOp != null)
+			switch(_aggOp) {
+				case SUM:
+					agg_op = "SumOp";
+					initial_value = "(T)0.0";
+					break;
+				case SUM_SQ:
+					agg_op = "SumSqOp";
+					initial_value = "(T)0.0";
+					break;
+				case MIN:
+					agg_op = "MinOp";
+					initial_value = "MAX<T>()";
+					break;
+				case MAX:
+					agg_op = "MaxOp";
+					initial_value = "-MAX<T>()";
+					break;
+				default:
+					agg_op = "IdentityOp";
+					initial_value = "(T)0.0";
+			}
+
+			tmp = tmp.replaceAll("%AGG_OP%", agg_op);
+			tmp = tmp.replaceAll("%INITIAL_VALUE%", initial_value);
+		}
 		return tmp;
 	}
 
 	@Override
 	public void setOutputDims() {
-		
-		
 	}
 
 	@Override
@@ -206,4 +236,8 @@ public class CNodeCell extends CNodeTpl
 		sb.append("]");
 		return sb.toString();
 	}
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) && _output.isSupported(api);
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
index 11d893e..b91c66f 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
@@ -19,9 +19,13 @@
 
 package org.apache.sysds.hops.codegen.cplan;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.sysds.hops.Hop;
 import org.apache.sysds.common.Types.DataType;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
 
 public class CNodeData extends CNode 
 {
@@ -54,17 +58,48 @@ public class CNodeData extends CNode
 	
 	@Override
 	public String getVarname() {
-		if( "NaN".equals(_name) )
+		if ("NaN".equals(_name))
 			return "Double.NaN";
-		else if( "Infinity".equals(_name) )
+		else if ("Infinity".equals(_name))
 			return "Double.POSITIVE_INFINITY";
-		else if( "-Infinity".equals(_name) )
+		else if ("-Infinity".equals(_name))
 			return "Double.NEGATIVE_INFINITY";
-		else if( "true".equals(_name) || "false".equals(_name) )
+		else if ("true".equals(_name) || "false".equals(_name))
 			return "true".equals(_name) ? "1d" : "0d";
 		else
 			return _name;
 	}
+
+	public String getVarname(GeneratorAPI api) {
+		if(api == GeneratorAPI.JAVA) {
+			if ("NaN".equals(_name))
+				return "Double.NaN";
+			else if ("Infinity".equals(_name))
+				return "Double.POSITIVE_INFINITY";
+			else if ("-Infinity".equals(_name))
+				return "Double.NEGATIVE_INFINITY";
+			else if ("true".equals(_name) || "false".equals(_name))
+				return "true".equals(_name) ? "1d" : "0d";
+			else
+				return _name;
+		}
+		else if(api == GeneratorAPI.CUDA) {
+			if ("NaN".equals(_name))
+				return isSinglePrecision() ? "CUDART_NAN_F" : "CUDART_NAN";
+			else if ("Infinity".equals(_name))
+				return isSinglePrecision() ? "CUDART_INF_F" : "CUDART_INF";
+			else if ("-Infinity".equals(_name))
+				return isSinglePrecision() ? "-CUDART_INF_F" : "-CUDART_INF";
+			else if ("true".equals(_name) || "false".equals(_name))
+				return "true".equals(_name) ? "1" : "0";
+			else if (StringUtils.isNumeric(_name))
+				return isSinglePrecision() ? _name + ".0f" : _name + ".0";
+			else
+				return _name;
+		}
+		else
+			throw new RuntimeException("Unknown GeneratorAPI: " + SpoofCompiler.API);
+	}
 	
 	public long getHopID() {
 		return _hopID;
@@ -80,7 +115,7 @@ public class CNodeData extends CNode
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		return "";
 	}
 
@@ -113,4 +148,8 @@ public class CNodeData extends CNode
 				_name.equals(((CNodeData)o)._name) : 
 				_hopID == ((CNodeData)o)._hopID));
 	}
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		return true;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
index 2a5dec8..895d945 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
@@ -22,11 +22,12 @@ package org.apache.sysds.hops.codegen.cplan;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.commons.collections.CollectionUtils;
 import org.apache.sysds.hops.Hop;
 import org.apache.sysds.common.Types.AggOp;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
-import org.apache.sysds.runtime.util.CollectionUtils;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeMultiAgg extends CNodeTpl
 {
@@ -105,14 +106,14 @@ public class CNodeMultiAgg extends CNodeTpl
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		// note: ignore sparse flag, generate both
 		String tmp = TEMPLATE;
 		
 		//generate dense/sparse bodies
 		StringBuilder sb = new StringBuilder();
 		for( CNode out : _outputs )
-			sb.append(out.codegen(false));
+			sb.append(out.codegen(false, api));
 		for( CNode out : _outputs )
 			out.resetGenerated();
 
@@ -181,7 +182,7 @@ public class CNodeMultiAgg extends CNodeTpl
 			return false;
 		CNodeMultiAgg that = (CNodeMultiAgg)o;
 		return super.equals(o)
-			&& CollectionUtils.equals(_aggOps, that._aggOps)
+			&& CollectionUtils.isEqualCollection(_aggOps, that._aggOps)	
 			&& equalInputReferences(
 				_outputs, that._outputs, _inputs, that._inputs);
 	}
@@ -205,4 +206,14 @@ public class CNodeMultiAgg extends CNodeTpl
 				return null;
 		}
 	}
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
index a1b110d..5500ddb 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
@@ -27,6 +27,7 @@ import org.apache.sysds.hops.codegen.template.TemplateUtils;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.util.DnnUtils;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeNary extends CNode
 {
@@ -43,7 +44,7 @@ public class CNodeNary extends CNode
 					return true;
 			return false;
 		}
-		public String getTemplate(boolean sparseGen, long len, ArrayList<CNode> inputs) {
+		public String getTemplate(boolean sparseGen, long len, ArrayList<CNode> inputs, GeneratorAPI api) {
 			switch (this) {
 				case VECT_CBIND:
 					StringBuilder sb = new StringBuilder();
@@ -111,7 +112,7 @@ public class CNodeNary extends CNode
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		if( isGenerated() )
 			return "";
 		
@@ -119,14 +120,14 @@ public class CNodeNary extends CNode
 		
 		//generate children
 		for(CNode in : _inputs)
-			sb.append(in.codegen(sparse));
+			sb.append(in.codegen(sparse, api));
 		
 		//generate nary operation (use sparse template, if data input)
 		boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
 			&& _inputs.get(0).getVarname().startsWith("a")
 			&& !_inputs.get(0).isLiteral());
 		String var = createVarname();
-		String tmp = _type.getTemplate(lsparse, _cols, _inputs);
+		String tmp = _type.getTemplate(lsparse, _cols, _inputs, api);
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace sparse and dense inputs
@@ -219,7 +220,18 @@ public class CNodeNary extends CNode
 		return super.equals(that)
 			&& _type == that._type;
 	}
-	
+
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
+
 	private static String getDnnParameterString(List<CNode> inputs, boolean unary) {
 		int off = unary ? 0 : 1;
 		
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
index 9f0aa69..6a3a647 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
@@ -25,7 +25,7 @@ import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 import org.apache.sysds.lops.MMTSJ;
 import org.apache.sysds.runtime.codegen.SpoofOuterProduct.OutProdType;
 import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeOuterProduct extends CNodeTpl
 {	
@@ -78,12 +78,12 @@ public class CNodeOuterProduct extends CNodeTpl
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		// note: ignore sparse flag, generate both
 		String tmp = TEMPLATE;
 		
 		//generate dense/sparse bodies
-		String tmpDense = _output.codegen(false);
+		String tmpDense = _output.codegen(false, api);
 		_output.resetGenerated();
 
 		tmp = tmp.replace("%TMP%", createVarname());
@@ -186,4 +186,15 @@ public class CNodeOuterProduct extends CNodeTpl
 		sb.append("]");
 		return sb.toString();
 	}
+
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
index 94f9ed9..b3304bd 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
@@ -26,6 +26,7 @@ import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysds.hops.codegen.template.TemplateUtils;
 import org.apache.sysds.runtime.codegen.SpoofRowwise.RowType;
 import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeRow extends CNodeTpl
 {
@@ -95,15 +96,15 @@ public class CNodeRow extends CNodeTpl
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		// note: ignore sparse flag, generate both
 		String tmp = TEMPLATE;
 		
 		//generate dense/sparse bodies
-		String tmpDense = _output.codegen(false)
+		String tmpDense = _output.codegen(false, api)
 			+ getOutputStatement(_output.getVarname());
 		_output.resetGenerated();
-		String tmpSparse = _output.codegen(true)
+		String tmpSparse = _output.codegen(true, api)
 			+ getOutputStatement(_output.getVarname());
 		tmp = tmp.replace("%TMP%", createVarname());
 		tmp = tmp.replace("%BODY_dense%", tmpDense);
@@ -209,4 +210,15 @@ public class CNodeRow extends CNodeTpl
 		sb.append("]");
 		return sb.toString();
 	}
+
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
index 8939db9..5e81109 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
@@ -23,7 +23,7 @@ import java.util.Arrays;
 
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeTernary extends CNode
 {
@@ -37,43 +37,7 @@ public class CNodeTernary extends CNode
 			return Arrays.stream(values()).anyMatch(tt -> tt.name().equals(value));
 		}
 		
-		public String getTemplate(boolean sparse) {
-			switch (this) {
-				case PLUS_MULT:
-					return "    double %TMP% = %IN1% + %IN2% * %IN3%;\n";
-				
-				case MINUS_MULT:
-					return "    double %TMP% = %IN1% - %IN2% * %IN3%;\n";
-				
-				case BIASADD:
-					return "    double %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
-				
-				case BIASMULT:
-					return "    double %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
-				
-				case REPLACE:
-					return "    double %TMP% = (%IN1% == %IN2% || (Double.isNaN(%IN1%) "
-							+ "&& Double.isNaN(%IN2%))) ? %IN3% : %IN1%;\n";
-				
-				case REPLACE_NAN:
-					return "    double %TMP% = Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
-				
-				case IFELSE:
-					return "    double %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
-				
-				case LOOKUP_RC1:
-					return sparse ?
-						"    double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
-						"    double %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
-					
-				case LOOKUP_RVECT1:
-					return "    double[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
-					
-				default: 
-					throw new RuntimeException("Invalid ternary type: "+this.toString());
-			}
-		}
-		
+
 		public boolean isVectorPrimitive() {
 			return (this == LOOKUP_RVECT1);
 		}
@@ -94,23 +58,25 @@ public class CNodeTernary extends CNode
 	}
 	
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		if( isGenerated() )
 			return "";
 			
 		StringBuilder sb = new StringBuilder();
 		
 		//generate children
-		sb.append(_inputs.get(0).codegen(sparse));
-		sb.append(_inputs.get(1).codegen(sparse));
-		sb.append(_inputs.get(2).codegen(sparse));
+		sb.append(_inputs.get(0).codegen(sparse, api));
+		sb.append(_inputs.get(1).codegen(sparse, api));
+		sb.append(_inputs.get(2).codegen(sparse, api));
 		
 		//generate binary operation
 		boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
 			&& _inputs.get(0).getVarname().startsWith("a")
 			&& !_inputs.get(0).isLiteral());
 		String var = createVarname();
-		String tmp = _type.getTemplate(lsparse);
+//		String tmp = _type.getTemplate(lsparse, api, lang);
+		String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparse);
+
 		tmp = tmp.replace("%TMP%", var);
 		for( int j=1; j<=3; j++ ) {
 			String varj = _inputs.get(j-1).getVarname();
@@ -186,4 +152,14 @@ public class CNodeTernary extends CNode
 		return super.equals(that)
 			&& _type == that._type;
 	}
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
index 82187bb..2026eb3 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
@@ -24,13 +24,15 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 
 public abstract class CNodeTpl extends CNode implements Cloneable
 {
 	private int _beginLine = -1;
-	
+
+	protected GeneratorAPI api = GeneratorAPI.AUTO;
+
 	public CNodeTpl(ArrayList<CNode> inputs, CNode output ) {
 		if(inputs.size() < 1)
 			throw new RuntimeException("Cannot pass empty inputs to the CNodeTpl");
@@ -74,7 +76,7 @@ public abstract class CNodeTpl extends CNode implements Cloneable
 	}
 	
 	public String codegen() {
-		return codegen(false);
+		return codegen(false, GeneratorAPI.AUTO);
 	}
 	
 	@Override
@@ -83,7 +85,7 @@ public abstract class CNodeTpl extends CNode implements Cloneable
 	public abstract SpoofOutputDimsType getOutputDimType();
 	
 	public abstract String getTemplateInfo();
-	
+
 	public abstract void renameInputs();
 	
 	protected void renameInputs(ArrayList<CNode> inputs, int startIndex) {
@@ -232,4 +234,6 @@ public abstract class CNodeTpl extends CNode implements Cloneable
 		}
 		return -1;
 	}
+
+	public GeneratorAPI getGeneratorAPI() { return api; }
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
index ca571ea..2ff9054 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
@@ -25,7 +25,7 @@ import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 
 public class CNodeUnary extends CNode
 {
@@ -47,108 +47,7 @@ public class CNodeUnary extends CNode
 		public static boolean contains(String value) {
 			return Arrays.stream(values()).anyMatch(ut -> ut.name().equals(value));
 		}
-		
-		public String getTemplate(boolean sparse) {
-			switch( this ) {
-				case ROW_SUMS:
-				case ROW_SUMSQS:
-				case ROW_MINS:
-				case ROW_MAXS:
-				case ROW_MEANS:
-				case ROW_COUNTNNZS: {
-					String vectName = StringUtils.capitalize(name().substring(4, name().length()-1).toLowerCase());
-					return sparse ? "    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n": 
-									"    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n"; 
-				}
-			
-				case VECT_EXP:
-				case VECT_POW2:
-				case VECT_MULT2: 
-				case VECT_SQRT: 
-				case VECT_LOG:
-				case VECT_ABS:
-				case VECT_ROUND:
-				case VECT_CEIL:
-				case VECT_FLOOR:
-				case VECT_SIGN:
-				case VECT_SIN:
-				case VECT_COS:
-				case VECT_TAN:
-				case VECT_ASIN:
-				case VECT_ACOS:
-				case VECT_ATAN:
-				case VECT_SINH:
-				case VECT_COSH:
-				case VECT_TANH:
-				case VECT_CUMSUM:
-				case VECT_CUMMIN:
-				case VECT_CUMMAX:
-				case VECT_SPROP:
-				case VECT_SIGMOID: {
-					String vectName = getVectorPrimitiveName();
-					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" : 
-									"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
-				}
-					
-				case EXP:
-					return "    double %TMP% = FastMath.exp(%IN1%);\n";
-				case LOOKUP_R:
-					return sparse ?
-						"    double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
-						"    double %TMP% = getValue(%IN1%, rix);\n";
-				case LOOKUP_C:
-					return "    double %TMP% = getValue(%IN1%, n, 0, cix);\n";
-				case LOOKUP_RC:
-					return "    double %TMP% = getValue(%IN1%, n, rix, cix);\n";
-				case LOOKUP0:
-					return "    double %TMP% = %IN1%[0];\n";
-				case POW2:
-					return "    double %TMP% = %IN1% * %IN1%;\n";
-				case MULT2:
-					return "    double %TMP% = %IN1% + %IN1%;\n";
-				case ABS:
-					return "    double %TMP% = Math.abs(%IN1%);\n";
-				case SIN:
-					return "    double %TMP% = FastMath.sin(%IN1%);\n";
-				case COS: 
-					return "    double %TMP% = FastMath.cos(%IN1%);\n";
-				case TAN:
-					return "    double %TMP% = FastMath.tan(%IN1%);\n";
-				case ASIN:
-					return "    double %TMP% = FastMath.asin(%IN1%);\n";
-				case ACOS:
-					return "    double %TMP% = FastMath.acos(%IN1%);\n";
-				case ATAN:
-					return "    double %TMP% = Math.atan(%IN1%);\n";
-				case SINH:
-					return "    double %TMP% = FastMath.sinh(%IN1%);\n";
-				case COSH: 
-					return "    double %TMP% = FastMath.cosh(%IN1%);\n";
-				case TANH:
-					return "    double %TMP% = FastMath.tanh(%IN1%);\n";
-				case SIGN:
-					return "    double %TMP% = FastMath.signum(%IN1%);\n";
-				case SQRT:
-					return "    double %TMP% = Math.sqrt(%IN1%);\n";
-				case LOG:
-					return "    double %TMP% = Math.log(%IN1%);\n";
-				case ROUND: 
-					return "    double %TMP% = Math.round(%IN1%);\n";
-				case CEIL:
-					return "    double %TMP% = FastMath.ceil(%IN1%);\n";
-				case FLOOR:
-					return "    double %TMP% = FastMath.floor(%IN1%);\n";
-				case SPROP:
-					return "    double %TMP% = %IN1% * (1 - %IN1%);\n";
-				case SIGMOID:
-					return "    double %TMP% = 1 / (1 + FastMath.exp(-%IN1%));\n";
-				case LOG_NZ:
-					return "    double %TMP% = (%IN1%==0) ? 0 : Math.log(%IN1%);\n";
-					
-				default: 
-					throw new RuntimeException("Invalid unary type: "+this.toString());
-			}
-		}
+
 		public boolean isVectorScalarPrimitive() {
 			return this == VECT_EXP || this == VECT_POW2
 				|| this == VECT_MULT2 || this == VECT_SQRT
@@ -196,21 +95,21 @@ public class CNodeUnary extends CNode
 	}
 
 	@Override
-	public String codegen(boolean sparse) {
+	public String codegen(boolean sparse, GeneratorAPI api) {
 		if( isGenerated() )
 			return "";
 		
 		StringBuilder sb = new StringBuilder();
 		
 		//generate children
-		sb.append(_inputs.get(0).codegen(sparse));
+		sb.append(_inputs.get(0).codegen(sparse, api));
 		
 		//generate unary operation
 		boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
 			&& _inputs.get(0).getVarname().startsWith("a")
 			&& !_inputs.get(0).isLiteral());
 		String var = createVarname();
-		String tmp = _type.getTemplate(lsparse);
+		String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparse);
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace sparse and dense inputs
@@ -361,4 +260,14 @@ public class CNodeUnary extends CNode
 		return super.equals(that)
 			&& _type == that._type;
 	}
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA);
+		int i = 0;
+		while(is_supported && i < _inputs.size()) {
+			CNode in = _inputs.get(i++);
+			is_supported = in.isSupported(api);
+		}
+		return  is_supported;
+	}
 }
diff --git a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
similarity index 60%
copy from src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
copy to src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
index 1becb67..8a8a3be 100644
--- a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
@@ -1,4 +1,4 @@
-<!--
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -6,25 +6,31 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
--->
-
-<root>
-   <sysds.localtmpdir>/tmp/systemds</sysds.localtmpdir>
-   <sysds.scratch>scratch_space</sysds.scratch>
-   <sysds.optlevel>7</sysds.optlevel>
-   <sysds.codegen.enabled>true</sysds.codegen.enabled>
-   <sysds.codegen.plancache>true</sysds.codegen.plancache>
-   <sysds.codegen.literals>1</sysds.codegen.literals>
-
-   <!-- The number of theads for the spark instance artificially selected-->
-   <sysds.local.spark.number.threads>16</sysds.local.spark.number.threads>
-</root>
\ No newline at end of file
+ */
+
+package org.apache.sysds.hops.codegen.cplan;
+
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public interface CodeTemplate {
+
+    String getTemplate();
+
+    String getTemplate(CNodeUnary.UnaryType type, boolean sparse);
+
+    String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+                              boolean scalarInput);
+
+    String getTemplate(CNodeTernary.TernaryType type, boolean sparse);
+
+    String getTemplate(SpoofCellwise.CellType ct);
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java
new file mode 100644
index 0000000..8d78b7b
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Binary implements CodeTemplate {
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+                              boolean scalarInput) {
+
+        if(isSinglePrecision()) {
+            switch(type) {
+                case DOT_PRODUCT:
+                    return sparseLhs ? "    T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : "    T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                case VECT_MATRIXMULT:
+                    return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : "    T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                case VECT_OUTERMULT_ADD:
+                    return sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+                //vector-scalar-add operations
+                case VECT_MULT_ADD:
+                case VECT_DIV_ADD:
+                case VECT_MINUS_ADD:
+                case VECT_PLUS_ADD:
+                case VECT_POW_ADD:
+                case VECT_XOR_ADD:
+                case VECT_MIN_ADD:
+                case VECT_MAX_ADD:
+                case VECT_EQUAL_ADD:
+                case VECT_NOTEQUAL_ADD:
+                case VECT_LESS_ADD:
+                case VECT_LESSEQUAL_ADD:
+                case VECT_GREATER_ADD:
+                case VECT_GREATEREQUAL_ADD:
+                case VECT_CBIND_ADD: {
+                    String vectName = type.getVectorPrimitiveName();
+                    if(scalarVector)
+                        return sparseLhs ? "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+                    else
+                        return sparseLhs ? "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+                }
+
+                //vector-scalar operations
+                case VECT_MULT_SCALAR:
+                case VECT_DIV_SCALAR:
+                case VECT_MINUS_SCALAR:
+                case VECT_PLUS_SCALAR:
+                case VECT_POW_SCALAR:
+                case VECT_XOR_SCALAR:
+                case VECT_BITWAND_SCALAR:
+                case VECT_MIN_SCALAR:
+                case VECT_MAX_SCALAR:
+                case VECT_EQUAL_SCALAR:
+                case VECT_NOTEQUAL_SCALAR:
+                case VECT_LESS_SCALAR:
+                case VECT_LESSEQUAL_SCALAR:
+                case VECT_GREATER_SCALAR:
+                case VECT_GREATEREQUAL_SCALAR: {
+                    String vectName = type.getVectorPrimitiveName();
+                    if(scalarVector)
+                        return sparseRhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+                    else
+                        return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+                }
+
+                case VECT_CBIND:
+                    if(scalarInput)
+                        return "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+                    else
+                        return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+                    //vector-vector operations
+                case VECT_MULT:
+                case VECT_DIV:
+                case VECT_MINUS:
+                case VECT_PLUS:
+                case VECT_XOR:
+                case VECT_BITWAND:
+                case VECT_BIASADD:
+                case VECT_BIASMULT:
+                case VECT_MIN:
+                case VECT_MAX:
+                case VECT_EQUAL:
+                case VECT_NOTEQUAL:
+                case VECT_LESS:
+                case VECT_LESSEQUAL:
+                case VECT_GREATER:
+                case VECT_GREATEREQUAL: {
+                    String vectName = type.getVectorPrimitiveName();
+                    return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : sparseRhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                }
+
+                //scalar-scalar operations
+                case MULT:
+                    return "    T %TMP% = %IN1% * %IN2%;\n";
+                case DIV:
+                    return "    T %TMP% = %IN1% / %IN2%;\n";
+                case PLUS:
+                    return "    T %TMP% = %IN1% + %IN2%;\n";
+                case MINUS:
+                    return "    T %TMP% = %IN1% - %IN2%;\n";
+                case MODULUS:
+                    return "    T %TMP% = modulus(%IN1%, %IN2%);\n";
+                case INTDIV:
+                    return "    T %TMP% = intDiv(%IN1%, %IN2%);\n";
+                case LESS:
+                    return "    T %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
+                case LESSEQUAL:
+                    return "    T %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
+                case GREATER:
+                    return "    T %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
+                case GREATEREQUAL:
+                    return "    T %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
+                case EQUAL:
+                    return "    T %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
+                case NOTEQUAL:
+                    return "    T %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
+
+                case MIN:
+                    return "    T %TMP% = fminf(%IN1%, %IN2%);\n";
+                case MAX:
+                    return "    T %TMP% = fmaxf(%IN1%, %IN2%);\n";
+                case LOG:
+                    return "    T %TMP% = logf(%IN1%)/Math.log(%IN2%);\n";
+                case LOG_NZ:
+                    return "    T %TMP% = (%IN1% == 0) ? 0 : logf(%IN1%) / logf(%IN2%);\n";
+                case POW:
+                    return "    T %TMP% = powf(%IN1%, %IN2%);\n";
+                case MINUS1_MULT:
+                    return "    T %TMP% = 1 - %IN1% * %IN2%;\n";
+                case MINUS_NZ:
+                    return "    T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+                case XOR:
+                    return "    T %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1.0f : 0.0f;\n";
+                case BITWAND:
+                    return "    T %TMP% = bwAnd(%IN1%, %IN2%);\n";
+                case SEQ_RIX:
+                    return "    T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+                default:
+                    throw new RuntimeException("Invalid binary type: " + this.toString());
+            }
+        }
+        else {
+            switch(type) {
+                case DOT_PRODUCT:
+                    return sparseLhs ? "    T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : "    T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                case VECT_MATRIXMULT:
+                    return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : "    T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                case VECT_OUTERMULT_ADD:
+                    return sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+                //vector-scalar-add operations
+                case VECT_MULT_ADD:
+                case VECT_DIV_ADD:
+                case VECT_MINUS_ADD:
+                case VECT_PLUS_ADD:
+                case VECT_POW_ADD:
+                case VECT_XOR_ADD:
+                case VECT_MIN_ADD:
+                case VECT_MAX_ADD:
+                case VECT_EQUAL_ADD:
+                case VECT_NOTEQUAL_ADD:
+                case VECT_LESS_ADD:
+                case VECT_LESSEQUAL_ADD:
+                case VECT_GREATER_ADD:
+                case VECT_GREATEREQUAL_ADD:
+                case VECT_CBIND_ADD: {
+                    String vectName = type.getVectorPrimitiveName();
+                    if(scalarVector)
+                        return sparseLhs ? "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+                    else
+                        return sparseLhs ? "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : "    LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+                }
+
+                //vector-scalar operations
+                case VECT_MULT_SCALAR:
+                case VECT_DIV_SCALAR:
+                case VECT_MINUS_SCALAR:
+                case VECT_PLUS_SCALAR:
+                case VECT_POW_SCALAR:
+                case VECT_XOR_SCALAR:
+                case VECT_BITWAND_SCALAR:
+                case VECT_MIN_SCALAR:
+                case VECT_MAX_SCALAR:
+                case VECT_EQUAL_SCALAR:
+                case VECT_NOTEQUAL_SCALAR:
+                case VECT_LESS_SCALAR:
+                case VECT_LESSEQUAL_SCALAR:
+                case VECT_GREATER_SCALAR:
+                case VECT_GREATEREQUAL_SCALAR: {
+                    String vectName = type.getVectorPrimitiveName();
+                    if(scalarVector)
+                        return sparseRhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+                    else
+                        return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+                }
+
+                case VECT_CBIND:
+                    if(scalarInput)
+                        return "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+                    else
+                        return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+                    //vector-vector operations
+                case VECT_MULT:
+                case VECT_DIV:
+                case VECT_MINUS:
+                case VECT_PLUS:
+                case VECT_XOR:
+                case VECT_BITWAND:
+                case VECT_BIASADD:
+                case VECT_BIASMULT:
+                case VECT_MIN:
+                case VECT_MAX:
+                case VECT_EQUAL:
+                case VECT_NOTEQUAL:
+                case VECT_LESS:
+                case VECT_LESSEQUAL:
+                case VECT_GREATER:
+                case VECT_GREATEREQUAL: {
+                    String vectName = type.getVectorPrimitiveName();
+                    return sparseLhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : sparseRhs ? "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : "    T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+                }
+
+                //scalar-scalar operations
+                case MULT:
+                    return "    T %TMP% = %IN1% * %IN2%;\n";
+                case DIV:
+                    return "    T %TMP% = %IN1% / %IN2%;\n";
+                case PLUS:
+                    return "    T %TMP% = %IN1% + %IN2%;\n";
+                case MINUS:
+                    return "    T %TMP% = %IN1% - %IN2%;\n";
+                case MODULUS:
+                    return "    T %TMP% = modulus(%IN1%, %IN2%);\n";
+                case INTDIV:
+                    return "    T %TMP% = intDiv(%IN1%, %IN2%);\n";
+                case LESS:
+                    return "    T %TMP% = (%IN1% < %IN2%) ? 1.0 : 0.0;\n";
+                case LESSEQUAL:
+                    return "    T %TMP% = (%IN1% <= %IN2%) ? 1.0 : 0.0;\n";
+                case GREATER:
+                    return "    T %TMP% = (%IN1% > (%IN2% + EPSILON)) ? 1.0 : 0.0;\n";
+                case GREATEREQUAL:
+                    return "    T %TMP% = (%IN1% >= %IN2%) ? 1.0 : 0.0;\n";
+                case EQUAL:
+                    return "    T %TMP% = (%IN1% == %IN2%) ? 1.0 : 0.0;\n";
+                case NOTEQUAL:
+                    return "    T %TMP% = (%IN1% != %IN2%) ? 1.0 : 0.0;\n";
+
+                case MIN:
+                    return "    T %TMP% = min(%IN1%, %IN2%);\n";
+                case MAX:
+                    return "    T %TMP% = max(%IN1%, %IN2%);\n";
+                case LOG:
+                    return "    T %TMP% = log(%IN1%)/Math.log(%IN2%);\n";
+                case LOG_NZ:
+                    return "    T %TMP% = (%IN1% == 0) ? 0 : log(%IN1%) / log(%IN2%);\n";
+                case POW:
+                    return "    T %TMP% = pow(%IN1%, %IN2%);\n";
+                case MINUS1_MULT:
+                    return "    T %TMP% = 1 - %IN1% * %IN2%;\n";
+                case MINUS_NZ:
+                    return "    T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+                case XOR:
+//                    return "    T %TMP% = ( (%IN1% != 0.0) != (%IN2% != 0.0) ) ? 1.0 : 0.0;\n";
+                    return "    T %TMP% = ( (%IN1% < EPSILON) != (%IN2% < EPSILON) ) ? 1.0 : 0.0;\n";
+                case BITWAND:
+                    return "    T %TMP% = bwAnd(%IN1%, %IN2%);\n";
+                case SEQ_RIX:
+                    return "    T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+                default:
+                    throw new RuntimeException("Invalid binary type: " + this.toString());
+            }
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java
new file mode 100644
index 0000000..f76f3ec
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.conf.DMLConfig;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+import org.apache.sysds.runtime.io.IOUtilFunctions;
+
+import java.io.*;
+import java.util.stream.Collectors;
+
+// ToDo: clean code template and load from file
+public class CellWise implements CodeTemplate {
+
+    private static final String TEMPLATE_PATH = "/cuda/spoof/cellwise.cu";
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        try {
+            // Change prefix to the code template file if running from jar. File were extracted to a temporary
+            // directory in that case. By default we load the template from the source tree.
+            if(CellWise.class.getProtectionDomain().getCodeSource().getLocation().getPath().contains(".jar"))
+                return(IOUtilFunctions.toString(new FileInputStream(ConfigurationManager.getDMLConfig()
+                        .getTextValue(DMLConfig.LOCAL_TMP_DIR) + TEMPLATE_PATH)));
+            else
+                return IOUtilFunctions.toString(new FileInputStream(System.getProperty("user.dir") +
+                        "/src/main" + TEMPLATE_PATH));
+        }
+        catch(IOException e) {
+            System.out.println(e.getMessage());
+            return null;
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java
new file mode 100644
index 0000000..3edfcea
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Ternary implements CodeTemplate {
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        if(isSinglePrecision()) {
+            switch (type) {
+                case PLUS_MULT:
+                    return "    T %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+                case MINUS_MULT:
+                    return "    T %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+                case BIASADD:
+                    return "    T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+                case BIASMULT:
+                    return "    T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+                case REPLACE:
+                    return "    T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
+                            + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
+
+                case REPLACE_NAN:
+                    return "    T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
+
+                case IFELSE:
+                    return "    T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+                case LOOKUP_RC1:
+                    return sparse ?
+                            "    T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+                            "    T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+                case LOOKUP_RVECT1:
+                    return "    T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+                default:
+                    throw new RuntimeException("Invalid ternary type: " + this.toString());
+            }
+        }
+        else {
+            switch (type) {
+                case PLUS_MULT:
+                    return "    T %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+                case MINUS_MULT:
+                    return "    T %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+                case BIASADD:
+                    return "    T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+                case BIASMULT:
+                    return "    T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+                case REPLACE:
+                    return "    T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
+                            + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
+
+                case REPLACE_NAN:
+                    return "    T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
+
+                case IFELSE:
+                    return "    T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+                case LOOKUP_RC1:
+                    return sparse ?
+                            "    T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+                            "    T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+                case LOOKUP_RVECT1:
+                    return "    T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+                default:
+                    throw new RuntimeException("Invalid ternary type: "+this.toString());
+            }
+
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+                              boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java
new file mode 100644
index 0000000..ed18779
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Unary implements CodeTemplate {
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        if(isSinglePrecision()) {
+            switch( type ) {
+                case ROW_SUMS:
+                case ROW_SUMSQS:
+                case ROW_MINS:
+                case ROW_MAXS:
+                case ROW_MEANS:
+                case ROW_COUNTNNZS: {
+                    String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+                    return sparse ? "    T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+                        "    T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+                }
+
+                case VECT_EXP:
+                case VECT_POW2:
+                case VECT_MULT2:
+                case VECT_SQRT:
+                case VECT_LOG:
+                case VECT_ABS:
+                case VECT_ROUND:
+                case VECT_CEIL:
+                case VECT_FLOOR:
+                case VECT_SIGN:
+                case VECT_SIN:
+                case VECT_COS:
+                case VECT_TAN:
+                case VECT_ASIN:
+                case VECT_ACOS:
+                case VECT_ATAN:
+                case VECT_SINH:
+                case VECT_COSH:
+                case VECT_TANH:
+                case VECT_CUMSUM:
+                case VECT_CUMMIN:
+                case VECT_CUMMAX:
+                case VECT_SPROP:
+                case VECT_SIGMOID: {
+                    String vectName = type.getVectorPrimitiveName();
+                    return sparse ? "    T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+                        "    T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+                }
+
+                case EXP:
+                    return "    T %TMP% = expf(%IN1%);\n";
+                case LOOKUP_R:
+                    return sparse ?
+                        "    T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+                        "    T %TMP% = getValue(%IN1%, rix);\n";
+                case LOOKUP_C:
+                    return "    T %TMP% = getValue(%IN1%, n, 0, cix);\n";
+                case LOOKUP_RC:
+                    return "    T %TMP% = getValue(%IN1%, n, rix, cix);\n";
+                case LOOKUP0:
+                    return "    T %TMP% = %IN1%[0];\n";
+                case POW2:
+                    return "    T %TMP% = %IN1% * %IN1%;\n";
+                case MULT2:
+                    return "    T %TMP% = %IN1% + %IN1%;\n";
+                case ABS:
+                    return "    T %TMP% = fabsf(%IN1%);\n";
+                case SIN:
+                    return "    T %TMP% = sinf(%IN1%);\n";
+                case COS:
+                    return "    T %TMP% = cosf(%IN1%);\n";
+                case TAN:
+                    return "    T %TMP% = tanf(%IN1%);\n";
+                case ASIN:
+                    return "    T %TMP% = asinf(%IN1%);\n";
+                case ACOS:
+                    return "    T %TMP% = acosf(%IN1%);\n";
+                case ATAN:
+                    return "    T %TMP% = atanf(%IN1%);\n";
+                case SINH:
+                    return "    T %TMP% = sinhf(%IN1%);\n";
+                case COSH:
+                    return "    T %TMP% = coshf(%IN1%);\n";
+                case TANH:
+                    return "    T %TMP% = tanhf(%IN1%);\n";
+                case SIGN:
+                    return "    T %TMP% = signbit(%IN1%) == 0 ? 1.0f : -1.0f;\n";
+                case SQRT:
+                    return "    T %TMP% = sqrtf(%IN1%);\n";
+                case LOG:
+                    return "    T %TMP% = logf(%IN1%);\n";
+                case ROUND:
+                    return "    T %TMP% = roundf(%IN1%);\n";
+                case CEIL:
+                    return "    T %TMP% = ceilf(%IN1%);\n";
+                case FLOOR:
+                    return "    T %TMP% = floorf(%IN1%);\n";
+                case SPROP:
+                    return "    T %TMP% = %IN1% * (1 - %IN1%);\n";
+                case SIGMOID:
+                    return "    T %TMP% = 1 / (1 + expf(-%IN1%));\n";
+                case LOG_NZ:
+                    return "    T %TMP% = (%IN1%==0) ? 0 : logf(%IN1%);\n";
+
+                default:
+                    throw new RuntimeException("Invalid unary type: "+this.toString());
+            }
+        }
+        else { /* double precision */
+            switch( type ) {
+                case ROW_SUMS:
+                case ROW_SUMSQS:
+                case ROW_MINS:
+                case ROW_MAXS:
+                case ROW_MEANS:
+                case ROW_COUNTNNZS: {
+                    String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+                    return sparse ? "    T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+                        "    T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+                }
+
+                case VECT_EXP:
+                case VECT_POW2:
+                case VECT_MULT2:
+                case VECT_SQRT:
+                case VECT_LOG:
+                case VECT_ABS:
+                case VECT_ROUND:
+                case VECT_CEIL:
+                case VECT_FLOOR:
+                case VECT_SIGN:
+                case VECT_SIN:
+                case VECT_COS:
+                case VECT_TAN:
+                case VECT_ASIN:
+                case VECT_ACOS:
+                case VECT_ATAN:
+                case VECT_SINH:
+                case VECT_COSH:
+                case VECT_TANH:
+                case VECT_CUMSUM:
+                case VECT_CUMMIN:
+                case VECT_CUMMAX:
+                case VECT_SPROP:
+                case VECT_SIGMOID: {
+                    String vectName = type.getVectorPrimitiveName();
+                    return sparse ? "    T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+                        "    T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+                }
+
+                case EXP:
+                    return "    T %TMP% = exp(%IN1%);\n";
+                case LOOKUP_R:
+                    return sparse ?
+                        "    T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+                        "    T %TMP% = getValue(%IN1%, rix);\n";
+                case LOOKUP_C:
+                    return "    T %TMP% = getValue(%IN1%, n, 0, cix);\n";
+                case LOOKUP_RC:
+                    return "    T %TMP% = getValue(%IN1%, n, rix, cix);\n";
+                case LOOKUP0:
+                    return "    T %TMP% = %IN1%[0];\n";
+                case POW2:
+                    return "    T %TMP% = %IN1% * %IN1%;\n";
+                case MULT2:
+                    return "    T %TMP% = %IN1% + %IN1%;\n";
+                case ABS:
+                    return "    T %TMP% = fabs(%IN1%);\n";
+                case SIN:
+                    return "    T %TMP% = sin(%IN1%);\n";
+                case COS:
+                    return "    T %TMP% = cos(%IN1%);\n";
+                case TAN:
+                    return "    T %TMP% = tan(%IN1%);\n";
+                case ASIN:
+                    return "    T %TMP% = asin(%IN1%);\n";
+                case ACOS:
+                    return "    T %TMP% = acos(%IN1%);\n";
+                case ATAN:
+                    return "    T %TMP% = atan(%IN1%);\n";
+                case SINH:
+                    return "    T %TMP% = sinh(%IN1%);\n";
+                case COSH:
+                    return "    T %TMP% = cosh(%IN1%);\n";
+                case TANH:
+                    return "    T %TMP% = tanh(%IN1%);\n";
+                case SIGN:
+                    return "    T %TMP% = signbit(%IN1%) == 0 ? 1.0f : -1.0f;\n";
+                case SQRT:
+                    return "    T %TMP% = sqrt(%IN1%);\n";
+                case LOG:
+                    return "    T %TMP% = log(%IN1%);\n";
+                case ROUND:
+                    return "    T %TMP% = round(%IN1%);\n";
+                case CEIL:
+                    return "    T %TMP% = ceil(%IN1%);\n";
+                case FLOOR:
+                    return "    T %TMP% = floor(%IN1%);\n";
+                case SPROP:
+                    return "    T %TMP% = %IN1% * (1 - %IN1%);\n";
+                case SIGMOID:
+                    return "    T %TMP% = 1 / (1 + exp(-%IN1%));\n";
+                case LOG_NZ:
+                    return "    T %TMP% = (%IN1%==0) ? 0 : log(%IN1%);\n";
+
+                default:
+                    throw new RuntimeException("Invalid unary type: "+this.toString());
+            }
+
+        }
+    }
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java
new file mode 100644
index 0000000..39b0f6f
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class Binary implements CodeTemplate {
+    @Override
+    public String getTemplate(BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+                              boolean scalarInput) {
+
+        switch (type) {
+            case DOT_PRODUCT:
+                return sparseLhs ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
+                        "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+            case VECT_MATRIXMULT:
+                return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
+                        "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+            case VECT_OUTERMULT_ADD:
+                return  sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+                        sparseRhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+                                "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+            //vector-scalar-add operations
+            case VECT_MULT_ADD:
+            case VECT_DIV_ADD:
+            case VECT_MINUS_ADD:
+            case VECT_PLUS_ADD:
+            case VECT_POW_ADD:
+            case VECT_XOR_ADD:
+            case VECT_MIN_ADD:
+            case VECT_MAX_ADD:
+            case VECT_EQUAL_ADD:
+            case VECT_NOTEQUAL_ADD:
+            case VECT_LESS_ADD:
+            case VECT_LESSEQUAL_ADD:
+            case VECT_GREATER_ADD:
+            case VECT_GREATEREQUAL_ADD:
+            case VECT_CBIND_ADD: {
+                String vectName = type.getVectorPrimitiveName();
+                if( scalarVector )
+                    return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" :
+                            "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+                else
+                    return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" :
+                            "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+            }
+
+            //vector-scalar operations
+            case VECT_MULT_SCALAR:
+            case VECT_DIV_SCALAR:
+            case VECT_MINUS_SCALAR:
+            case VECT_PLUS_SCALAR:
+            case VECT_POW_SCALAR:
+            case VECT_XOR_SCALAR:
+            case VECT_BITWAND_SCALAR:
+            case VECT_MIN_SCALAR:
+            case VECT_MAX_SCALAR:
+            case VECT_EQUAL_SCALAR:
+            case VECT_NOTEQUAL_SCALAR:
+            case VECT_LESS_SCALAR:
+            case VECT_LESSEQUAL_SCALAR:
+            case VECT_GREATER_SCALAR:
+            case VECT_GREATEREQUAL_SCALAR: {
+                String vectName = type.getVectorPrimitiveName();
+                if( scalarVector )
+                    return sparseRhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" :
+                            "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+                else
+                    return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
+                            "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+            }
+
+            case VECT_CBIND:
+                if( scalarInput )
+                    return  "    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+                else
+                    return sparseLhs ?
+                            "    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
+                            "    double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+                //vector-vector operations
+            case VECT_MULT:
+            case VECT_DIV:
+            case VECT_MINUS:
+            case VECT_PLUS:
+            case VECT_XOR:
+            case VECT_BITWAND:
+            case VECT_BIASADD:
+            case VECT_BIASMULT:
+            case VECT_MIN:
+            case VECT_MAX:
+            case VECT_EQUAL:
+            case VECT_NOTEQUAL:
+            case VECT_LESS:
+            case VECT_LESSEQUAL:
+            case VECT_GREATER:
+            case VECT_GREATEREQUAL: {
+                String vectName = type.getVectorPrimitiveName();
+                return sparseLhs ?
+                        "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" :
+                        sparseRhs ?
+                                "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" :
+                                "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+            }
+
+            //scalar-scalar operations
+            case MULT:
+                return "    double %TMP% = %IN1% * %IN2%;\n";
+
+            case DIV:
+                return "    double %TMP% = %IN1% / %IN2%;\n";
+            case PLUS:
+                return "    double %TMP% = %IN1% + %IN2%;\n";
+            case MINUS:
+                return "    double %TMP% = %IN1% - %IN2%;\n";
+            case MODULUS:
+                return "    double %TMP% = LibSpoofPrimitives.mod(%IN1%, %IN2%);\n";
+            case INTDIV:
+                return "    double %TMP% = LibSpoofPrimitives.intDiv(%IN1%, %IN2%);\n";
+            case LESS:
+                return "    double %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
+            case LESSEQUAL:
+                return "    double %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
+            case GREATER:
+                return "    double %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
+            case GREATEREQUAL:
+                return "    double %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
+            case EQUAL:
+                return "    double %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
+            case NOTEQUAL:
+                return "    double %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
+
+            case MIN:
+                return "    double %TMP% = Math.min(%IN1%, %IN2%);\n";
+            case MAX:
+                return "    double %TMP% = Math.max(%IN1%, %IN2%);\n";
+            case LOG:
+                return "    double %TMP% = Math.log(%IN1%)/Math.log(%IN2%);\n";
+            case LOG_NZ:
+                return "    double %TMP% = (%IN1% == 0) ? 0 : Math.log(%IN1%)/Math.log(%IN2%);\n";
+            case POW:
+                return "    double %TMP% = Math.pow(%IN1%, %IN2%);\n";
+            case MINUS1_MULT:
+                return "    double %TMP% = 1 - %IN1% * %IN2%;\n";
+            case MINUS_NZ:
+                return "    double %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+            case XOR:
+                return "    double %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1 : 0;\n";
+            case BITWAND:
+                return "    double %TMP% = LibSpoofPrimitives.bwAnd(%IN1%, %IN2%);\n";
+            case SEQ_RIX:
+                return "    double %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+            default:
+                throw new RuntimeException("Invalid binary type: "+this.toString());
+        }
+    }
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java
new file mode 100644
index 0000000..319c872
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class CellWise implements CodeTemplate {
+    public static final String TEMPLATE =
+            "package codegen;\n"
+                    + "import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;\n"
+                    + "import org.apache.sysds.runtime.codegen.SpoofCellwise;\n"
+                    + "import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;\n"
+                    + "import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;\n"
+                    + "import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;\n"
+                    + "import org.apache.commons.math3.util.FastMath;\n"
+                    + "\n"
+                    + "public final class %TMP% extends SpoofCellwise {\n"
+                    + "  public %TMP%() {\n"
+                    + "    super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP_NAME%);\n"
+                    + "  }\n"
+                    + "  protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) { \n"
+                    + "%BODY_dense%"
+                    + "    return %OUT%;\n"
+                    + "  }\n"
+                    + "}\n";
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        switch(ct) {
+            case NO_AGG:
+            case FULL_AGG:
+            case ROW_AGG:
+            case COL_AGG:
+            default:
+                return TEMPLATE;
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
new file mode 100644
index 0000000..af48d05
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class Ternary implements CodeTemplate {
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        switch (type) {
+            case PLUS_MULT:
+                return "    double %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+            case MINUS_MULT:
+                return "    double %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+            case BIASADD:
+                return "    double %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+            case BIASMULT:
+                return "    double %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+            case REPLACE:
+                return "    double %TMP% = (%IN1% == %IN2% || (Double.isNaN(%IN1%) "
+                        + "&& Double.isNaN(%IN2%))) ? %IN3% : %IN1%;\n";
+
+            case REPLACE_NAN:
+                return "    double %TMP% = Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
+
+            case IFELSE:
+                return "    double %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+            case LOOKUP_RC1:
+                return sparse ?
+                        "    double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+                        "    double %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+            case LOOKUP_RVECT1:
+                return "    double[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+            default:
+                throw new RuntimeException("Invalid ternary type: "+this.toString());
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+                              boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
new file mode 100644
index 0000000..7071e08
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class Unary implements CodeTemplate {
+    @Override
+    public String getTemplate(UnaryType type, boolean sparse) {
+        switch( type ) {
+            case ROW_SUMS:
+            case ROW_SUMSQS:
+            case ROW_MINS:
+            case ROW_MAXS:
+            case ROW_MEANS:
+            case ROW_COUNTNNZS: {
+                String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+                return sparse ? "    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+                        "    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+            }
+
+            case VECT_EXP:
+            case VECT_POW2:
+            case VECT_MULT2:
+            case VECT_SQRT:
+            case VECT_LOG:
+            case VECT_ABS:
+            case VECT_ROUND:
+            case VECT_CEIL:
+            case VECT_FLOOR:
+            case VECT_SIGN:
+            case VECT_SIN:
+            case VECT_COS:
+            case VECT_TAN:
+            case VECT_ASIN:
+            case VECT_ACOS:
+            case VECT_ATAN:
+            case VECT_SINH:
+            case VECT_COSH:
+            case VECT_TANH:
+            case VECT_CUMSUM:
+            case VECT_CUMMIN:
+            case VECT_CUMMAX:
+            case VECT_SPROP:
+            case VECT_SIGMOID: {
+                String vectName = type.getVectorPrimitiveName();
+                return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+                        "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+            }
+
+            case EXP:
+                return "    double %TMP% = FastMath.exp(%IN1%);\n";
+            case LOOKUP_R:
+                return sparse ?
+                        "    double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+                        "    double %TMP% = getValue(%IN1%, rix);\n";
+            case LOOKUP_C:
+                return "    double %TMP% = getValue(%IN1%, n, 0, cix);\n";
+            case LOOKUP_RC:
+                return "    double %TMP% = getValue(%IN1%, n, rix, cix);\n";
+            case LOOKUP0:
+                return "    double %TMP% = %IN1%[0];\n";
+            case POW2:
+                return "    double %TMP% = %IN1% * %IN1%;\n";
+            case MULT2:
+                return "    double %TMP% = %IN1% + %IN1%;\n";
+            case ABS:
+                return "    double %TMP% = Math.abs(%IN1%);\n";
+            case SIN:
+                return "    double %TMP% = FastMath.sin(%IN1%);\n";
+            case COS:
+                return "    double %TMP% = FastMath.cos(%IN1%);\n";
+            case TAN:
+                return "    double %TMP% = FastMath.tan(%IN1%);\n";
+            case ASIN:
+                return "    double %TMP% = FastMath.asin(%IN1%);\n";
+            case ACOS:
+                return "    double %TMP% = FastMath.acos(%IN1%);\n";
+            case ATAN:
+                return "    double %TMP% = Math.atan(%IN1%);\n";
+            case SINH:
+                return "    double %TMP% = FastMath.sinh(%IN1%);\n";
+            case COSH:
+                return "    double %TMP% = FastMath.cosh(%IN1%);\n";
+            case TANH:
+                return "    double %TMP% = FastMath.tanh(%IN1%);\n";
+            case SIGN:
+                return "    double %TMP% = FastMath.signum(%IN1%);\n";
+            case SQRT:
+                return "    double %TMP% = Math.sqrt(%IN1%);\n";
+            case LOG:
+                return "    double %TMP% = Math.log(%IN1%);\n";
+            case ROUND:
+                return "    double %TMP% = Math.round(%IN1%);\n";
+            case CEIL:
+                return "    double %TMP% = FastMath.ceil(%IN1%);\n";
+            case FLOOR:
+                return "    double %TMP% = FastMath.floor(%IN1%);\n";
+            case SPROP:
+                return "    double %TMP% = %IN1% * (1 - %IN1%);\n";
+            case SIGMOID:
+                return "    double %TMP% = 1 / (1 + FastMath.exp(-%IN1%));\n";
+            case LOG_NZ:
+                return "    double %TMP% = (%IN1%==0) ? 0 : Math.log(%IN1%);\n";
+
+            default:
+                throw new RuntimeException("Invalid unary type: "+this.toString());
+        }
+    }
+
+    @Override
+    public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate() {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(SpoofCellwise.CellType ct) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+
+    @Override
+    public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+        throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+    }
+}
diff --git a/src/main/java/org/apache/sysds/lops/SpoofFused.java b/src/main/java/org/apache/sysds/lops/SpoofFused.java
index bd605f8..0795f8f 100644
--- a/src/main/java/org/apache/sysds/lops/SpoofFused.java
+++ b/src/main/java/org/apache/sysds/lops/SpoofFused.java
@@ -21,7 +21,8 @@ package org.apache.sysds.lops;
 
 import java.util.ArrayList;
 
- 
+
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.lops.LopProperties.ExecType;
 
 import org.apache.sysds.common.Types.DataType;
@@ -31,12 +32,17 @@ public class SpoofFused extends Lop
 {
 	private final Class<?> _class;
 	private final int _numThreads;
-	
-	public SpoofFused( ArrayList<Lop> inputs, DataType dt, ValueType vt, Class<?> cla, int k, ExecType etype) {
+	private final String _genVarName;
+
+	private GeneratorAPI _api;
+	public SpoofFused(ArrayList<Lop> inputs, DataType dt, ValueType vt, Class<?> cla, GeneratorAPI api,
+					  String genVarName, int k, ExecType etype) {
 		super(Type.SpoofFused, dt, vt);
 		_class = cla;
 		_numThreads = k;
-		
+		_api = api;
+		_genVarName = genVarName;
+
 		for( Lop lop : inputs ) {
 			addInput(lop);
 			lop.addOutput(this);
@@ -47,7 +53,11 @@ public class SpoofFused extends Lop
 
 	@Override
 	public String toString() {
-		return "spoof("+_class.getSimpleName()+")";
+		if(_class != null)
+			return "spoof("+_class.getSimpleName()+")";
+		else
+			return "spoof(" + _genVarName + ")";
+
 	}
 
 	@Override
@@ -98,8 +108,14 @@ public class SpoofFused extends Lop
 		sb.append( "spoof" );
 
 		sb.append( OPERAND_DELIMITOR );
-		sb.append( _class.getName() );
-		
+		sb.append( _api);
+		sb.append( OPERAND_DELIMITOR );
+		if(_class != null)
+			sb.append( _class.getName() );
+		else
+			sb.append("codegen." + _genVarName);
+
+
 		for(int i=0; i < inputs.length; i++) {
 			sb.append( OPERAND_DELIMITOR );
 			sb.append( getInputs().get(i).prepInputOperand(inputs[i]));
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java b/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
index 64ef9f8..d8d3d2b 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
@@ -63,7 +63,9 @@ public class CodegenUtils
 	
 	//janino-specific map of source code transfer/recompile on-demand
 	private static ConcurrentHashMap<String, String> _src = new ConcurrentHashMap<>();
-	
+
+	private static ConcurrentHashMap<String, SpoofCUDA> _native_op_data = new ConcurrentHashMap<>();
+
 	//javac-specific working directory for src/class files
 	private static String _workingDir = null;
 	
@@ -156,7 +158,15 @@ public class CodegenUtils
 		
 		return ret;
 	}
-	
+
	/** Returns the registered CUDA operator for the given generated name, or null if none was registered. */
	public static SpoofCUDA getNativeOpData(String name) {
		return _native_op_data.get(name);
	}
+
	/** Registers a compiled CUDA operator under its generated name for later lookup by the runtime instruction. */
	public static void putNativeOpData(SpoofCUDA op) {
		_native_op_data.put(op.getName(), op);
	}
+
 	public static SideInput createSideInput(MatrixBlock in) {
 		SideInput ret = (in.isInSparseFormat() || !in.isAllocated()) ?
 			new SideInput(null, in, in.getNumColumns()) :
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
new file mode 100644
index 0000000..ac783c6
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.codegen;
+
+import java.util.ArrayList;
+
+import org.apache.sysds.hops.codegen.SpoofCompiler;
+import org.apache.sysds.hops.codegen.cplan.CNodeCell;
+import org.apache.sysds.hops.codegen.cplan.CNodeMultiAgg;
+import org.apache.sysds.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysds.hops.codegen.cplan.CNodeRow;
+import org.apache.sysds.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class SpoofCUDA extends SpoofOperator {
+
+    private final CNodeTpl cnt;
+    public final String name;
+
+    public SpoofCUDA(CNodeTpl cnode) {
+        name = "codegen." + cnode.getVarname();
+        cnt = cnode;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public CNodeTpl getCNodeTemplate() {
+        return cnt;
+    }
+
+    public String getSpoofTemplateType() {
+        if (cnt instanceof CNodeCell)
+            return "CW";
+        else if(cnt instanceof CNodeRow)
+            return "RA";
+        else if(cnt instanceof CNodeMultiAgg)
+            return "MA";
+        else if(cnt instanceof CNodeOuterProduct)
+            return "OP";
+        else
+            throw new RuntimeException("unknown spoof operator type");
+    }
+    @Override
+    public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) {
+        throw new RuntimeException("method not implemented for SpoofNativeCUDA");
+    }
+
+    public double execute(ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj,
+                               ExecutionContext ec) {
+        double ret = 0;
+        long out_ptr = 0;
+
+        if(out_obj != null)
+            out_ptr = ec.getGPUPointerAddress(out_obj);
+
+        int offset = 1;
+        if(cnt instanceof CNodeOuterProduct)
+            offset = 2;
+
+        // only dense input preparation for now
+        long[] in_ptrs = new long[offset];
+        for(int i = 0; i < offset; ++i)
+            in_ptrs[i] = ec.getGPUPointerAddress(inputs.get(i));
+
+        long[] side_ptrs = new long[inputs.size() - offset];
+        for(int i = offset; i < inputs.size(); ++i)
+            side_ptrs[i - offset] = ec.getGPUPointerAddress(inputs.get(i));
+
+        if(isSinglePrecision()) {
+            float[] scalars = prepInputScalarsFloat(scalarObjects);
+
+            // ToDo: handle float
+           ret = execute_f(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), name.split("\\.")[1],
+                    in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
+
+        }
+        else {
+            double[] scalars = prepInputScalars(scalarObjects);
+
+            ret = execute_d(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), name.split("\\.")[1],
+                    in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
+        }
+        return ret;
+    }
+
+    @Override
+    public String getSpoofType() {
+        String tmp[] = getClass().getName().split("\\.");
+            return  tmp[tmp.length-1] + "_" + getSpoofTemplateType() + "_" + name.split("\\.")[1];
+    }
+
+    private native float execute_f(long ctx, String name, long[] in_ptr, long[] side_ptr,
+                                   long out_ptr, float[] scalars, long m, long n, long grix);
+
+    private native double execute_d(long ctx, String name, long[] in_ptr, long[] side_ptr,
+                                    long out_ptr, double[] scalars, long m, long n, long grix);
+}
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
index 87ddfea..3088e84 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
@@ -145,7 +145,14 @@ public abstract class SpoofOperator implements Serializable
 			scalars[i] = scalarObjects.get(i).getDoubleValue();
 		return scalars;
 	}
-	
+
+	protected static float[] prepInputScalarsFloat(ArrayList<ScalarObject> scalarObjects) {
+		float[] scalars = new float[scalarObjects.size()];
+		for(int i=0; i < scalarObjects.size(); i++)
+			scalars[i] = (float)scalarObjects.get(i).getDoubleValue();
+		return scalars;
+	}
+
 	public static long getTotalInputNnz(ArrayList<MatrixBlock> inputs) {
 		return inputs.stream().mapToLong(in -> in.getNonZeros()).sum();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
index a34b77e..fceaea4 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -72,7 +72,7 @@ public class ExecutionContext {
 	protected LocalVariableMap _variables;
 	protected long _tid = -1;
 	protected boolean _autoCreateVars;
-	
+
 	//lineage map, cache, prepared dedup blocks
 	protected Lineage _lineage;
 
@@ -124,19 +124,19 @@ public class ExecutionContext {
 	public void setLineage(Lineage lineage) {
 		_lineage = lineage;
 	}
-	
+
 	public boolean isAutoCreateVars() {
 		return _autoCreateVars;
 	}
-	
+
 	public void setAutoCreateVars(boolean flag) {
 		_autoCreateVars = flag;
 	}
-	
+
 	public void setTID(long tid) {
 		_tid = tid;
 	}
-	
+
 	public long getTID() {
 		return _tid;
 	}
@@ -406,6 +406,14 @@ public class ExecutionContext {
 		return mo;
 	}
 
+	public long getGPUPointerAddress(MatrixObject obj) {
+
+			if(obj.getGPUObject(getGPUContext(0)) == null)
+				return 0;
+			else
+				return obj.getGPUObject(getGPUContext(0)).getPointerAddress();
+	}
+
 	public MatrixObject getMatrixInputForGPUInstruction(String varName, String opcode) {
 		GPUContext gCtx = getGPUContext(0);
 		MatrixObject mo = getMatrixObject(varName);
@@ -568,9 +576,9 @@ public class ExecutionContext {
 			return createFrameObject((FrameBlock) cb);
 		return null;
 	}
-	
+
 	public static MatrixObject createMatrixObject(MatrixBlock mb) {
-		MatrixObject ret = new MatrixObject(Types.ValueType.FP64, 
+		MatrixObject ret = new MatrixObject(Types.ValueType.FP64,
 			OptimizerUtils.getUniqueTempFileName());
 		ret.acquireModify(mb);
 		ret.setMetaData(new MetaDataFormat(new MatrixCharacteristics(
@@ -580,7 +588,7 @@ public class ExecutionContext {
 		ret.release();
 		return ret;
 	}
-	
+
 	public static FrameObject createFrameObject(FrameBlock fb) {
 		FrameObject ret = new FrameObject(OptimizerUtils.getUniqueTempFileName());
 		ret.acquireModify(fb);
@@ -589,7 +597,7 @@ public class ExecutionContext {
 		ret.release();
 		return ret;
 	}
-	
+
 	public List<MatrixBlock> getMatrixInputs(CPOperand[] inputs) {
 		return getMatrixInputs(inputs, false);
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java b/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
index 9d265f6..a49e0f8 100644
--- a/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
+++ b/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
@@ -52,7 +52,7 @@ public class IntegerDivide extends ValueFunction
 	/**
 	 * NOTE: The R semantics of integer divide a%/%b are to compute the 
 	 * double division and subsequently cast to int. In case of a NaN 
-	 * or +-INFINITY devision result, the overall output is NOT cast to
+	 * or +-INFINITY division result, the overall output is NOT cast to
 	 * int in order to prevent the special double values.
 	 * 
 	 * @param in1 double input 1
@@ -61,7 +61,7 @@ public class IntegerDivide extends ValueFunction
 	 */
 	private static double executeIntDiv( double in1, double in2 )
 	{
-		//compute normal double devision
+		//compute normal double division
 		double ret = in1 / in2;
 		
 		//check for NaN/+-INF intermediate (cast to int would eliminate it)
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
index e895797..4fca2ad 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
@@ -38,6 +38,7 @@ import org.apache.sysds.runtime.instructions.gpu.MatrixReshapeGPUInstruction;
 import org.apache.sysds.runtime.instructions.gpu.RelationalBinaryGPUInstruction;
 import org.apache.sysds.runtime.instructions.gpu.ReorgGPUInstruction;
 import org.apache.sysds.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
+import org.apache.sysds.runtime.instructions.gpu.SpoofCUDAInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {
@@ -157,7 +158,9 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( ">="   , GPUINSTRUCTION_TYPE.RelationalBinary);
 		
 		// Indexing 
-		String2GPUInstructionType.put( RightIndex.OPCODE, GPUINSTRUCTION_TYPE.MatrixIndexing); 
+		String2GPUInstructionType.put( RightIndex.OPCODE, GPUINSTRUCTION_TYPE.MatrixIndexing);
+
+		String2GPUInstructionType.put( "spoof"   , GPUINSTRUCTION_TYPE.SpoofFused);
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str ) {
@@ -217,7 +220,10 @@ public class GPUInstructionParser  extends InstructionParser
 
 			case MatrixIndexing:
 				return MatrixIndexingGPUInstruction.parseInstruction(str);
-				
+
+			case SpoofFused:
+				return SpoofCUDAInstruction.parseInstruction(str);
+
 			default: 
 				throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype );
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
index e1d12f5..f4e262e 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
@@ -55,11 +55,11 @@ public class SpoofCPInstruction extends ComputationCPInstruction {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 		
 		ArrayList<CPOperand> inlist = new ArrayList<>();
-		Class<?> cla = CodegenUtils.getClass(parts[1]);
+		Class<?> cla = CodegenUtils.getClass(parts[2]);
 		SpoofOperator op = CodegenUtils.createInstance(cla);
 		String opcode =  parts[0] + op.getSpoofType();
 		
-		for( int i=2; i<parts.length-2; i++ )
+		for( int i=3; i<parts.length-2; i++ )
 			inlist.add(new CPOperand(parts[i]));
 		CPOperand out = new CPOperand(parts[parts.length-2]);
 		int k = Integer.parseInt(parts[parts.length-1]);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
index e6f22e3..615e194 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
@@ -33,7 +33,7 @@ import org.apache.sysds.utils.Statistics;
 
 public abstract class GPUInstruction extends Instruction {
 	private static final Log LOG = LogFactory.getLog(GPUInstruction.class.getName());
-	
+
 	public enum GPUINSTRUCTION_TYPE {
 		AggregateUnary,
 		AggregateBinary,
@@ -47,7 +47,8 @@ public abstract class GPUInstruction extends Instruction {
 		BuiltinUnary,
 		BuiltinBinary,
 		Builtin,
-		MatrixIndexing
+		MatrixIndexing,
+		SpoofFused
 	}
 
 	// Memory/conversions
@@ -159,7 +160,7 @@ public abstract class GPUInstruction extends Instruction {
 		instOpcode = opcode;
 		_requiresLabelUpdate = super.requiresLabelUpdate();
 	}
-	
+
 	@Override
 	public IType getType() {
 		return IType.GPU;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
new file mode 100644
index 0000000..8049e87
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.instructions.gpu;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.hops.codegen.cplan.CNodeCell;
+import org.apache.sysds.runtime.codegen.CodegenUtils;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+import org.apache.sysds.runtime.codegen.SpoofOperator;
+import org.apache.sysds.runtime.codegen.SpoofCUDA;
+import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
+import org.apache.sysds.runtime.instructions.cp.CPOperand;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
+import org.apache.sysds.runtime.lineage.LineageTraceable;
+import org.apache.sysds.runtime.instructions.cp.DoubleObject;
+
+import java.util.ArrayList;
+
+public class SpoofCUDAInstruction extends GPUInstruction implements LineageTraceable {
+    private final SpoofCUDA _op;
+    private final CPOperand[] _in;
+
+    public final CPOperand _out;
+
+    private SpoofCUDAInstruction(SpoofOperator op, CPOperand[] in, CPOperand out, String opcode, String istr) {
+        super(null, opcode, istr);
+
+        if(!(op instanceof SpoofCUDA))
+            throw new RuntimeException("SpoofGPUInstruction needs an operator of type SpoofNativeCUDA!");
+
+        _op = (SpoofCUDA) op;
+        _in = in;
+        _out = out;
+        instString = istr;
+        instOpcode = opcode;
+    }
+
+    public static SpoofCUDAInstruction parseInstruction(String str) {
+        String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+
+        ArrayList<CPOperand> inlist = new ArrayList<>();
+        SpoofCUDA op = CodegenUtils.getNativeOpData(parts[2]);
+        String opcode =  op.getSpoofType();
+
+        for( int i=3; i<parts.length-2; i++ )
+            inlist.add(new CPOperand(parts[i]));
+        CPOperand out = new CPOperand(parts[parts.length-2]);
+
+        return new SpoofCUDAInstruction(op, inlist.toArray(new CPOperand[0]), out, opcode, str);
+    }
+
+    @Override
+    public void processInstruction(ExecutionContext ec) {
+
+        //get input matrices and scalars, incl pinning of matrices
+        ArrayList<MatrixObject> inputs = new ArrayList<>();
+        ArrayList<ScalarObject> scalars = new ArrayList<>();
+        for (CPOperand input : _in) {
+            if(input.getDataType()== Types.DataType.MATRIX)
+                inputs.add(ec.getMatrixInputForGPUInstruction(input.getName(), getExtendedOpcode()));
+            else if(input.getDataType()== Types.DataType.SCALAR) {
+                //note: even if literal, it might be compiled as scalar placeholder
+                scalars.add(ec.getScalarInput(input));
+            }
+        }
+
+        // set the output dimensions to the hop node matrix dimensions
+        if( _out.getDataType() == Types.DataType.MATRIX) {
+            long rows = inputs.get(0).getNumRows();
+            long cols = inputs.get(0).getNumColumns();
+            if(_op.getSpoofTemplateType().contains("CW"))
+                if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.COL_AGG)
+                    rows = 1;
+                else if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.ROW_AGG)
+                    cols = 1;
+
+            MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(_out.getName(), rows, cols).getKey();
+            ec.setMetaData(_out.getName(), out_obj.getNumRows(), out_obj.getNumColumns());
+            _op.execute(inputs, scalars, out_obj, ec);
+            ec.releaseMatrixOutputForGPUInstruction(_out.getName());
+        }
+        else if (_out.getDataType() == Types.DataType.SCALAR) {
+            ScalarObject out = new DoubleObject(_op.execute(inputs, scalars, null, ec));
+            ec.setScalarOutput(_out.getName(), out);
+        }
+
+        for (CPOperand input : _in)
+            if(input.getDataType()== Types.DataType.MATRIX)
+                ec.releaseMatrixInputForGPUInstruction(input.getName());
+    }
+
+    @Override
+    public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
+        return Pair.of(_out.getName(),
+                new LineageItem(getOpcode(), LineageItemUtils.getLineage(ec, _in)));
+    }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
index 15a345c..25f3059 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
@@ -21,6 +21,7 @@
 
 import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
 import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.runtime.JCuda.cudaGetDevice;
 import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
 
 import java.util.ArrayList;
@@ -83,6 +84,8 @@ public class GPUContextPool {
 	 * All these need be done once, and not per GPU
 	 */
 	public synchronized static void initializeGPU() {
+		if (initialized)
+			return;
 		initialized = true;
 		GPUContext.LOG.info("Initializing CUDA");
 		long start = System.nanoTime();
@@ -210,8 +213,7 @@ public class GPUContextPool {
 	public static synchronized List<GPUContext> reserveAllGPUContexts() {
 		if (reserved)
 			throw new DMLRuntimeException("Trying to re-reserve GPUs");
-		if (!initialized)
-			initializeGPU();
+		initializeGPU();
 		reserved = true;
 		LOG.trace("GPU : Reserved all GPUs");
 		return pool;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index ad20f46..b2967bd 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1006,4 +1006,23 @@ public class GPUObject {
 		return sb.toString();
 	}
 
	/**
	 * Extracts the raw device address (native pointer + byte offset) of a
	 * JCuda {@link Pointer}.
	 * NOTE(review): the local subclass appears necessary because the required
	 * Pointer members are not publicly accessible in jCuda — confirm against
	 * the jCuda version in use before simplifying.
	 */
	private static long getPointerAddress(Pointer p) {
		// WORKAROUND until a method like CUdeviceptr#getAddress exists in jCuda
		class PointerWithAddress extends Pointer
		{
			PointerWithAddress(Pointer other)
			{
				super(other);
			}
			long getAddress()
			{
				return getNativePointer() + getByteOffset();
			}
		}
		return new PointerWithAddress(p).getAddress();
	}
+
	/**
	 * Returns the raw device address of this object's dense pointer.
	 * NOTE(review): only the dense representation is handled here; behavior for
	 * sparse-allocated GPU objects should be confirmed by callers.
	 */
	public long getPointerAddress() {
		return getPointerAddress(getDensePointer());
	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
index a6cd221..072bfca 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
@@ -41,7 +41,7 @@ import jcuda.runtime.JCuda;
 
 public class JCudaKernels {
 
-	private final static String ptxFileName = "/kernels/SystemDS.ptx";
+	private final static String ptxFileName = "/cuda/kernels/SystemDS.ptx";
 	private HashMap<String, CUfunction> kernels = new HashMap<>();
 	private CUmodule module;
 
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
index 27f86c6..6dc2832 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
@@ -87,11 +87,11 @@ public class SpoofSPInstruction extends SPInstruction {
 		
 		//String opcode = parts[0];
 		ArrayList<CPOperand> inlist = new ArrayList<>();
-		Class<?> cls = CodegenUtils.getClass(parts[1]);
-		byte[] classBytes = CodegenUtils.getClassData(parts[1]);
+		Class<?> cls = CodegenUtils.getClass(parts[2]);
+		byte[] classBytes = CodegenUtils.getClassData(parts[2]);
 		String opcode =  parts[0] + CodegenUtils.createInstance(cls).getSpoofType();
 		
-		for( int i=2; i<parts.length-2; i++ )
+		for( int i=3; i<parts.length-2; i++ )
 			inlist.add(new CPOperand(parts[i]));
 		CPOperand out = new CPOperand(parts[parts.length-2]);
 		//note: number of threads parts[parts.length-1] always ignored
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index cb70436..5d33e2e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -295,7 +295,7 @@ public class LibMatrixNative
 		LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
 	}
 	
-	private static boolean isSinglePrecision() {
+	public static boolean isSinglePrecision() {
 		return ConfigurationManager.getDMLConfig()
 			.getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single");
 	}
diff --git a/src/main/java/org/apache/sysds/utils/NativeHelper.java b/src/main/java/org/apache/sysds/utils/NativeHelper.java
index 36ec816..cd9f8f9 100644
--- a/src/main/java/org/apache/sysds/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysds/utils/NativeHelper.java
@@ -293,11 +293,15 @@ public class NativeHelper {
 	 * @param optionalMsg message for debugging
 	 * @return true if successfully loaded BLAS
 	 */
-	private static boolean loadBLAS(String customLibPath, String blas, String optionalMsg) {
+	public static boolean loadBLAS(String customLibPath, String blas, String optionalMsg) {
 		// First attempt to load from custom library path
 		if((customLibPath != null) && (!customLibPath.equalsIgnoreCase("none"))) {
 			String libPath = customLibPath + File.separator + System.mapLibraryName(blas);
 			try {
+				// This fixes libPath if it already contained a prefix/suffix and mapLibraryName added another one.
+				libPath = libPath.replace("liblibsystemds", "libsystemds")
+								 .replace(".dll.dll", ".dll")
+								 .replace(".so.so", ".so");
 				System.load(libPath);
 				LOG.info("Loaded the library:" + libPath);
 				return true;
@@ -328,7 +332,7 @@ public class NativeHelper {
 	 * @param libFileName library file name)
 	 * @return true if successfully loaded BLAS
 	 */
-	private static boolean loadLibraryHelperFromResource(String libFileName)  {
+	public static boolean loadLibraryHelperFromResource(String libFileName)  {
 		OutputStream out = null;
 		try(InputStream in = NativeHelper.class.getResourceAsStream("/lib/"+ libFileName)) {
 			// This logic is added because Java does not allow to load library from a resource file.
diff --git a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
index 65093e2..6f887ce 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
@@ -485,8 +485,8 @@ public class CellwiseTmplTest extends AutomatedTestBase
 			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
 
 			runTest(true, false, null, -1); 
-			runRScript(true); 
-			
+			runRScript(true);
+
 			if(testname.equals(TEST_NAME6) || testname.equals(TEST_NAME7) 
 				|| testname.equals(TEST_NAME9) || testname.equals(TEST_NAME10)) {
 				//compare scalars 
@@ -504,7 +504,7 @@ public class CellwiseTmplTest extends AutomatedTestBase
 			if( !(rewrites && (testname.equals(TEST_NAME2)
 				|| testname.equals(TEST_NAME19))) && !testname.equals(TEST_NAME27) )
 				Assert.assertTrue(heavyHittersContainsSubString(
-						"spoofCell", "sp_spoofCell", "spoofMA", "sp_spoofMA"));
+						"spoofCell", "sp_spoofCell", "spoofMA", "sp_spoofMA", "gpu_SpoofCUDA_CW_"));
 			if( testname.equals(TEST_NAME7) ) //ensure matrix mult is fused
 				Assert.assertTrue(!heavyHittersContainsSubString("tsmm"));
 			else if( testname.equals(TEST_NAME10) ) //ensure min/max is fused
diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties
index d1f4e58..b479997 100644
--- a/src/test/resources/log4j.properties
+++ b/src/test/resources/log4j.properties
@@ -25,6 +25,7 @@ log4j.logger.org.apache.sysds.api.DMLScript=OFF
 log4j.logger.org.apache.sysds.test=INFO
 log4j.logger.org.apache.sysds.test.AutomatedTestBase=ERROR
 log4j.logger.org.apache.sysds=WARN
+#log4j.logger.org.apache.sysds.hops.codegen.SpoofCompiler=TRACE
 log4j.logger.org.apache.sysds.runtime.compress.AbstractCompressedMatrixBlock=ERROR
 # log4j.logger.org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory=DEBUG
 # log4j.logger.org.apache.sysds.runtime.compress.cocode=DEBUG
diff --git a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml b/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
index 1becb67..f77d94d 100644
--- a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
+++ b/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
@@ -27,4 +27,6 @@
 
    <!-- The number of theads for the spark instance artificially selected-->
    <sysds.local.spark.number.threads>16</sysds.local.spark.number.threads>
+
+   <sysds.codegen.api>auto</sysds.codegen.api>
 </root>
\ No newline at end of file