You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ma...@apache.org on 2020/11/07 23:49:00 UTC
[systemds] branch master updated: [SYSTEMDS-2691, 2692, 2698]
Initial SPOOF CUDA; Refactoring of CUDA kernels
This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 52f2b3e [SYSTEMDS-2691, 2692, 2698] Initial SPOOF CUDA; Refactoring of CUDA kernels
52f2b3e is described below
commit 52f2b3e27cc012272fe2ead3a3d0cde49983d4fb
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Sun Nov 8 00:47:15 2020 +0100
[SYSTEMDS-2691, 2692, 2698] Initial SPOOF CUDA; Refactoring of CUDA kernels
Changelog from squashed commits:
* Refactor CUDA codebase to its own directory
* JNI parts for CUDA codegen and more code reorganization
* SpoofCompiler CUDA loading and compiler invocation
* Cellwise code template and other CPlan templates for operators (*nary/data/...)
* SpoofCUDA runtime instruction and also:
* Import jitify as submodule
* Template loading from JAR file
* Configuration tag <sysds.codegen.api> to use CUDA codegen (see SystemDS-config.xml.template)
Closes #1092
---
.gitmodules | 3 +
conf/SystemDS-config.xml.template | 9 +-
pom.xml | 33 +-
src/main/cpp/kernels/Makefile | 40 -
.../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 0 -> 1008760 bytes
.../lib/libsystemds_spoof_cuda-Windows-AMD64.dll | Bin 0 -> 220672 bytes
src/main/cuda/CMakeLists.txt | 111 +
src/main/cuda/ext/jitify | 1 +
src/main/{cpp/kernels => cuda/headers}/agg_ops.cuh | 40 +-
src/main/{cpp/kernels => cuda/headers}/cum_max.cuh | 9 +-
src/main/{cpp/kernels => cuda/headers}/cum_min.cuh | 7 +-
.../{cpp/kernels => cuda/headers}/cum_prod.cuh | 7 +-
.../{cpp/kernels => cuda/headers}/cum_scan.cuh | 10 +-
src/main/{cpp/kernels => cuda/headers}/cum_sum.cuh | 7 +-
.../{cpp/kernels => cuda/headers}/cum_sum_prod.cuh | 8 +-
src/main/cuda/headers/reduction.cuh | 314 ++
src/main/cuda/headers/spoof_utils.cuh | 94 +
src/main/{cpp/kernels => cuda/headers}/utils.cuh | 23 +-
src/main/{cpp => cuda}/kernels/SystemDS.cu | 17 -
src/main/{cpp => cuda}/kernels/SystemDS.ptx | 0
src/main/cuda/kernels/reduction.cu | 282 ++
src/main/cuda/kernels/reduction.ptx | 3546 ++++++++++++++++++++
src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 141 +
src/main/cuda/spoof-launcher/SpoofCUDAContext.h | 269 ++
src/main/cuda/spoof-launcher/host_utils.h | 48 +
src/main/cuda/spoof-launcher/jni_bridge.cpp | 109 +
src/main/cuda/spoof-launcher/jni_bridge.h | 81 +
src/main/cuda/spoof/cellwise.cu | 54 +
src/main/cuda/spoof/functions.cuh | 86 +
src/main/java/org/apache/sysds/api/DMLScript.java | 33 +-
.../apache/sysds/conf/ConfigurationManager.java | 12 +-
src/main/java/org/apache/sysds/conf/DMLConfig.java | 5 +-
.../apache/sysds/hops/codegen/SpoofCompiler.java | 209 +-
.../apache/sysds/hops/codegen/SpoofFusedOp.java | 46 +-
.../org/apache/sysds/hops/codegen/cplan/CNode.java | 42 +-
.../sysds/hops/codegen/cplan/CNodeBinary.java | 175 +-
.../apache/sysds/hops/codegen/cplan/CNodeCell.java | 102 +-
.../apache/sysds/hops/codegen/cplan/CNodeData.java | 49 +-
.../sysds/hops/codegen/cplan/CNodeMultiAgg.java | 19 +-
.../apache/sysds/hops/codegen/cplan/CNodeNary.java | 22 +-
.../hops/codegen/cplan/CNodeOuterProduct.java | 17 +-
.../apache/sysds/hops/codegen/cplan/CNodeRow.java | 18 +-
.../sysds/hops/codegen/cplan/CNodeTernary.java | 62 +-
.../apache/sysds/hops/codegen/cplan/CNodeTpl.java | 12 +-
.../sysds/hops/codegen/cplan/CNodeUnary.java | 121 +-
.../sysds/hops/codegen/cplan/CodeTemplate.java} | 38 +-
.../sysds/hops/codegen/cplan/cpp/Binary.java | 322 ++
.../sysds/hops/codegen/cplan/cpp/CellWise.java | 76 +
.../sysds/hops/codegen/cplan/cpp/Ternary.java | 130 +
.../apache/sysds/hops/codegen/cplan/cpp/Unary.java | 258 ++
.../sysds/hops/codegen/cplan/java/Binary.java | 200 ++
.../sysds/hops/codegen/cplan/java/CellWise.java | 79 +
.../sysds/hops/codegen/cplan/java/Ternary.java | 89 +
.../sysds/hops/codegen/cplan/java/Unary.java | 152 +
.../java/org/apache/sysds/lops/SpoofFused.java | 30 +-
.../apache/sysds/runtime/codegen/CodegenUtils.java | 14 +-
.../apache/sysds/runtime/codegen/SpoofCUDA.java | 121 +
.../sysds/runtime/codegen/SpoofOperator.java | 9 +-
.../controlprogram/context/ExecutionContext.java | 26 +-
.../runtime/functionobjects/IntegerDivide.java | 4 +-
.../runtime/instructions/GPUInstructionParser.java | 10 +-
.../instructions/cp/SpoofCPInstruction.java | 4 +-
.../runtime/instructions/gpu/GPUInstruction.java | 7 +-
.../instructions/gpu/SpoofCUDAInstruction.java | 119 +
.../instructions/gpu/context/GPUContextPool.java | 6 +-
.../instructions/gpu/context/GPUObject.java | 19 +
.../instructions/gpu/context/JCudaKernels.java | 2 +-
.../instructions/spark/SpoofSPInstruction.java | 6 +-
.../sysds/runtime/matrix/data/LibMatrixNative.java | 2 +-
.../java/org/apache/sysds/utils/NativeHelper.java | 8 +-
.../test/functions/codegen/CellwiseTmplTest.java | 6 +-
src/test/resources/log4j.properties | 1 +
.../functions/codegen/SystemDS-config-codegen.xml | 2 +
73 files changed, 7451 insertions(+), 582 deletions(-)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8d14805
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "jitify"]
+ path = src/main/cuda/ext/jitify
+ url = git@github.com:NVIDIA/jitify.git
diff --git a/conf/SystemDS-config.xml.template b/conf/SystemDS-config.xml.template
index 7fa4d98..171cfb3 100644
--- a/conf/SystemDS-config.xml.template
+++ b/conf/SystemDS-config.xml.template
@@ -41,12 +41,15 @@
<!-- enables operator fusion via code generation, experimental feature -->
<sysds.codegen.enabled>false</sysds.codegen.enabled>
-
- <!-- set the codegen java compiler (auto, janino, javac) -->
+
+ <!-- set the codegen API (auto, java, cuda) -->
+ <sysds.codegen.api>auto</sysds.codegen.api>
+
+ <!-- set the codegen java compiler (auto, janino, javac, nvcc, nvrtc) -->
<sysds.codegen.compiler>auto</sysds.codegen.compiler>
<!-- set the codegen optimizer (fuse_all, fuse_no_redundancy, fuse_cost_based_v2) -->
- <sysds.codegen.compiler>fuse_cost_based_v2</sysds.codegen.compiler>
+ <sysds.codegen.optimizer>fuse_cost_based_v2</sysds.codegen.optimizer>
<!-- if codegen.enabled, enables source code caching of fused operators -->
<sysds.codegen.plancache>true</sysds.codegen.plancache>
diff --git a/pom.xml b/pom.xml
index 5f3da30..4027916 100644
--- a/pom.xml
+++ b/pom.xml
@@ -87,18 +87,31 @@
<targetPath>scripts</targetPath>
</resource>
<resource>
- <directory>src/main/cpp/kernels</directory>
- <excludes>
- <exclude>*.cu</exclude>
- <exclude>*.cuh</exclude>
- <exclude>Makefile</exclude>
- </excludes>
- <targetPath>kernels</targetPath>
+ <directory>src/main/cuda/kernels</directory>
+ <includes>
+ <include>SystemDS.ptx</include>
+ <include>reduction.ptx</include>
+ </includes>
+ <targetPath>cuda/kernels</targetPath>
</resource>
<resource>
<directory>src/main/cpp/lib</directory>
<targetPath>lib</targetPath>
</resource>
+ <resource>
+ <directory>src/main/cuda/spoof</directory>
+ <targetPath>cuda/spoof</targetPath>
+ </resource>
+ <resource>
+ <directory>src/main/cuda/headers</directory>
+ <includes>
+ <include>agg_ops.cuh</include>
+ <include>reduction.cuh</include>
+ <include>spoof_utils.cuh</include>
+ <include>utils.cuh</include>
+ </includes>
+ <targetPath>cuda/headers</targetPath>
+ </resource>
</resources>
<plugins>
@@ -531,6 +544,7 @@
<configuration>
<excludes>
<exclude>.gitignore</exclude>
+ <exclude>.gitmodules</exclude>
<exclude>.repository/</exclude>
<exclude>.idea/</exclude>
<exclude>.git</exclude>
@@ -567,7 +581,8 @@
<exclude>src/main/java/*.tokens</exclude>
<exclude>**/*.interp</exclude>
<!-- Compiled ptx file from nvcc -->
- <exclude>src/main/cpp/kernels/SystemDS.ptx</exclude>
+ <exclude>src/main/cuda/kernels/SystemDS.ptx</exclude>
+ <exclude>src/main/cuda/kernels/reduction.ptx</exclude>
<!-- Test Validation files -->
<exclude>src/test/scripts/functions/jmlc/**/*.impute</exclude>
<exclude>src/test/scripts/functions/jmlc/**/*.map</exclude>
@@ -586,6 +601,8 @@
<exclude>src/main/python/tests/lt*.txt</exclude>
<!-- Perftest requirement file -->
<exclude>scripts/perftest/python/requirements.txt</exclude>
+ <!-- external sources -->
+ <exclude>src/main/cuda/ext/**</exclude>
</excludes>
</configuration>
</plugin>
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
deleted file mode 100644
index 8766114..0000000
--- a/src/main/cpp/kernels/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-#-------------------------------------------------------------
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#-------------------------------------------------------------
-
-NVCC=nvcc
-GCC=gcc-8
-
-# set the CUDA-supported version of gcc through -ccbin
-CUDAFLAGS= -ptx -c -arch=sm_30 --std c++11 -ccbin $(GCC)
-
-# Flags for compilation on recent Ubuntu + P100
-#CUDAFLAGS= -ptx -c --std c++11 -ccbin gcc-8 -m64 -gencode arch=compute_60,code=sm_60
-
-# Use these flags for precise math
-#CUDAFLAGS= -ptx -c -arch=sm_30 -ftz=false -prec-div=true -prec-sqrt=true
-
-
-SystemDS.o: SystemDS.cu
- $(NVCC) $(CUDAFLAGS) SystemDS.cu
-
-all: SystemDS.o
-
-
-clean:
- rm -rf SystemDS.ptx
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
new file mode 100644
index 0000000..89f1270
Binary files /dev/null and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll
new file mode 100644
index 0000000..22fe3d5
Binary files /dev/null and b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll differ
diff --git a/src/main/cuda/CMakeLists.txt b/src/main/cuda/CMakeLists.txt
new file mode 100644
index 0000000..8b74dee
--- /dev/null
+++ b/src/main/cuda/CMakeLists.txt
@@ -0,0 +1,111 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
+
+# default to gcc 8.x while we're still supporting CUDA 10.x only
+if (UNIX)
+ set(CMAKE_CUDA_HOST_COMPILER g++-8 CACHE INTERNAL "")
+ set(CMAKE_CUDA_COMPILER nvcc CACHE INTERNAL "")
+ set(CMAKE_CXX_COMPILER g++ CACHE INTERNAL "")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_BUILD_WITH_INSTALL_RPATH True CACHE INTERNAL "")
+
+project(SystemDS LANGUAGES CXX CUDA)
+
+set(SYSDS_HEADERS
+ headers/agg_ops.cuh
+ headers/cum_max.cuh
+ headers/cum_min.cuh
+ headers/cum_prod.cuh
+ headers/cum_scan.cuh
+ headers/cum_sum.cuh
+ headers/cum_sum_prod.cuh
+ headers/utils.cuh)
+set(SYSDS_SOURCES kernels/SystemDS.cu)
+
+add_library(SystemDS OBJECT ${SYSDS_HEADERS} ${SYSDS_SOURCES})
+target_include_directories(SystemDS PUBLIC "${CMAKE_SOURCE_DIR}/headers")
+
+find_package(CUDAToolkit REQUIRED)
+cmake_policy(SET CMP0104 NEW)
+set(CMAKE_CUDA_ARCHITECTURES OFF)
+#ToDo: more compiler flag settings for Debug/Release compilation
+set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
+
+set_property(TARGET SystemDS PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
+set_property(TARGET SystemDS PROPERTY CUDA_PTX_COMPILATION ON)
+
+# sets the installation path to src/main/cuda
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+ set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}" CACHE PATH "sets the installation path to src/main/cpp/lib" FORCE)
+endif()
+
+install(FILES $<TARGET_OBJECTS:SystemDS> DESTINATION kernels)
+
+#-------------------------------------------------------------
+#project (spoof_cuda LANGUAGES CXX CUDA)
+
+add_library(reduction OBJECT kernels/reduction.cu headers/reduction.cuh)
+target_include_directories(reduction PUBLIC "${CMAKE_SOURCE_DIR}/headers")
+set_property(TARGET reduction PROPERTY CUDA_PTX_COMPILATION ON)
+install(FILES $<TARGET_OBJECTS:reduction> DESTINATION kernels)
+
+include_directories($ENV{JAVA_HOME}/include/)
+
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
+ include_directories($ENV{JAVA_HOME}/include/darwin)
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+ include_directories($ENV{JAVA_HOME}/include/linux)
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+ include_directories($ENV{JAVA_HOME}/include/win32)
+endif()
+
+set(SPOOF_HEADERS
+ spoof-launcher/jni_bridge.h
+ spoof-launcher/SpoofCUDAContext.h)
+set(SPOOF_SOURCES
+ spoof-launcher/jni_bridge.cpp
+ spoof-launcher/SpoofCUDAContext.cpp)
+
+add_library(spoof_cuda SHARED ${SPOOF_HEADERS} ${SPOOF_SOURCES} )
+
+target_include_directories(spoof_cuda PRIVATE "${CMAKE_SOURCE_DIR}/ext/jitify")
+target_link_libraries(spoof_cuda CUDA::nvrtc CUDA::cuda_driver CUDA::cudart)
+target_compile_features(spoof_cuda PUBLIC cxx_std_11)
+set_target_properties(spoof_cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(spoof_cuda PROPERTIES OUTPUT_NAME "systemds_spoof_cuda-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+
+# unify naming convention to libsystemds_...
+if (WIN32)
+ set(CMAKE_IMPORT_LIBRARY_PREFIX lib CACHE INTERNAL "")
+ set(CMAKE_SHARED_LIBRARY_PREFIX lib CACHE INTERNAL "")
+ target_link_libraries(spoof_cuda DbgHelp.lib)
+ install(TARGETS spoof_cuda RUNTIME DESTINATION ../cpp/lib)
+endif()
+
+if(UNIX)
+ install(TARGETS spoof_cuda LIBRARY DESTINATION ../cpp/lib)
+endif()
diff --git a/src/main/cuda/ext/jitify b/src/main/cuda/ext/jitify
new file mode 160000
index 0000000..3e96bcc
--- /dev/null
+++ b/src/main/cuda/ext/jitify
@@ -0,0 +1 @@
+Subproject commit 3e96bcceb9e42105f6a32315abb2af04585a55b0
diff --git a/src/main/cpp/kernels/agg_ops.cuh b/src/main/cuda/headers/agg_ops.cuh
similarity index 76%
rename from src/main/cpp/kernels/agg_ops.cuh
rename to src/main/cuda/headers/agg_ops.cuh
index c53ce04..abe9a2b 100644
--- a/src/main/cpp/kernels/agg_ops.cuh
+++ b/src/main/cuda/headers/agg_ops.cuh
@@ -17,19 +17,19 @@
* under the License.
*/
-#ifndef __AGG_OPS_H
-#define __AGG_OPS_H
-
#pragma once
+#ifndef AGG_OPS_H
+#define AGG_OPS_H
#include <cuda_runtime.h>
+#include <math_constants.h>
/**
* Functor op for assignment op. This is a dummy/identity op.
*/
template<typename T>
struct IdentityOp {
- __device__ __forceinline__ T operator()(T a) const {
+ __device__ __forceinline__ T operator()(T a, int idx = 0) const {
return a;
}
};
@@ -45,15 +45,39 @@ struct SumOp {
};
/**
+ * Functor op for sum of squares operation (returns a + b * b)
+ */
+template<typename T>
+struct SumSqOp {
+ __device__ __forceinline__ T operator()(T a, T b) const {
+ return a + b * b;
+ }
+};
+
+/**
* Functor op for min operation
*/
template<typename T>
struct MinOp {
__device__ __forceinline__ T operator()(T a, T b) const {
+ return a < b ? a : b;
+ }
+};
+
+template<>
+struct MinOp<double> {
+ __device__ __forceinline__ double operator()(double a, double b) const {
return fmin(a, b);
}
};
+template<>
+struct MinOp<float> {
+ __device__ __forceinline__ float operator()(float a, float b) const {
+ return fminf(a, b);
+ }
+};
+
/**
* Functor op for max operation
*/
@@ -124,10 +148,10 @@ struct MinNeutralElement {
};
template<>
-float MinNeutralElement<float>::get() { return INFINITY; }
+float MinNeutralElement<float>::get() { return CUDART_INF_F; }
template<>
-double MinNeutralElement<double>::get() { return INFINITY; }
+double MinNeutralElement<double>::get() { return CUDART_INF; }
template<typename T>
struct MaxNeutralElement {
@@ -135,9 +159,9 @@ struct MaxNeutralElement {
};
template<>
-float MaxNeutralElement<float>::get() { return -INFINITY; }
+float MaxNeutralElement<float>::get() { return -CUDART_INF_F; }
template<>
-double MaxNeutralElement<double>::get() { return -INFINITY; }
+double MaxNeutralElement<double>::get() { return -CUDART_INF; }
#endif // __AGG_OPS_H
diff --git a/src/main/cpp/kernels/cum_max.cuh b/src/main/cuda/headers/cum_max.cuh
similarity index 96%
rename from src/main/cpp/kernels/cum_max.cuh
rename to src/main/cuda/headers/cum_max.cuh
index 2571716..03cbb30 100644
--- a/src/main/cpp/kernels/cum_max.cuh
+++ b/src/main/cuda/headers/cum_max.cuh
@@ -17,10 +17,11 @@
* under the License.
*/
-#ifndef __CUM_MAX_H
-#define __CUM_MAX_H
-
#pragma once
+#ifndef CUM_MAX_H
+#define CUM_MAX_H
+
+using uint = unsigned int;
#include <cuda_runtime.h>
/**
@@ -75,4 +76,4 @@ extern "C" __global__ void cumulative_max_down_sweep_f(float *g_idata, float *g_
cumulative_scan_down_sweep<MaxOp<float>, MaxNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
}
-#endif // __CUM_MAX_H
+#endif // CUM_MAX_H
diff --git a/src/main/cpp/kernels/cum_min.cuh b/src/main/cuda/headers/cum_min.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_min.cuh
rename to src/main/cuda/headers/cum_min.cuh
index 5ebe659..3e653ba 100644
--- a/src/main/cpp/kernels/cum_min.cuh
+++ b/src/main/cuda/headers/cum_min.cuh
@@ -17,10 +17,11 @@
* under the License.
*/
-#ifndef __CUM_MIN_H
-#define __CUM_MIN_H
-
#pragma once
+#ifndef CUM_MIN_H
+#define CUM_MIN_H
+
+using uint = unsigned int;
#include <cuda_runtime.h>
/**
diff --git a/src/main/cpp/kernels/cum_prod.cuh b/src/main/cuda/headers/cum_prod.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_prod.cuh
rename to src/main/cuda/headers/cum_prod.cuh
index f294fc2..f6fc2fe 100644
--- a/src/main/cpp/kernels/cum_prod.cuh
+++ b/src/main/cuda/headers/cum_prod.cuh
@@ -17,10 +17,9 @@
* under the License.
*/
-#ifndef __CUM_PROD_H
-#define __CUM_PROD_H
-
#pragma once
+#ifndef CUM_PROD_H
+#define CUM_PROD_H
using uint = unsigned int;
#include <cuda_runtime.h>
@@ -77,4 +76,4 @@ extern "C" __global__ void cumulative_prod_down_sweep_f(float *g_idata, float *g
cumulative_scan_down_sweep<ProductOp<float>, ProdNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
}
-#endif // __CUM_PROD_H
+#endif // CUM_PROD_H
diff --git a/src/main/cpp/kernels/cum_scan.cuh b/src/main/cuda/headers/cum_scan.cuh
similarity index 96%
rename from src/main/cpp/kernels/cum_scan.cuh
rename to src/main/cuda/headers/cum_scan.cuh
index e73488d..67e53d6 100644
--- a/src/main/cpp/kernels/cum_scan.cuh
+++ b/src/main/cuda/headers/cum_scan.cuh
@@ -17,10 +17,12 @@
* under the License.
*/
-#ifndef __CUM_SCAN_H
-#define __CUM_SCAN_H
-
#pragma once
+#ifndef CUM_SCAN_H
+#define CUM_SCAN_H
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
/**
* Cumulative Scan - Applies <scanOp> to accumulate values over columns of an input matrix.
@@ -84,4 +86,4 @@ __device__ void cumulative_scan_down_sweep(T *g_idata, T *g_odata, T *g_tdata, u
g_odata[i] = acc = scan_op(acc, g_idata[i]);
}
-#endif // __CUM_SCAN_H
+#endif // CUM_SCAN_H
diff --git a/src/main/cpp/kernels/cum_sum.cuh b/src/main/cuda/headers/cum_sum.cuh
similarity index 97%
rename from src/main/cpp/kernels/cum_sum.cuh
rename to src/main/cuda/headers/cum_sum.cuh
index c142d57..5325138 100644
--- a/src/main/cpp/kernels/cum_sum.cuh
+++ b/src/main/cuda/headers/cum_sum.cuh
@@ -17,10 +17,9 @@
* under the License.
*/
-#ifndef __CUM_SUM_H
-#define __CUM_SUM_H
-
#pragma once
+#ifndef CUM_SUM_H
+#define CUM_SUM_H
using uint = unsigned int;
#include <cuda_runtime.h>
@@ -75,4 +74,4 @@ extern "C" __global__ void cumulative_sum_down_sweep_f(float *g_idata, float *g_
cumulative_scan_down_sweep<SumOp<float>, SumNeutralElement<float>, float>(g_idata, g_odata, g_tdata, rows, cols, block_height, op);
}
-#endif // __CUM_SUM_H
+#endif // CUM_SUM_H
diff --git a/src/main/cpp/kernels/cum_sum_prod.cuh b/src/main/cuda/headers/cum_sum_prod.cuh
similarity index 98%
rename from src/main/cpp/kernels/cum_sum_prod.cuh
rename to src/main/cuda/headers/cum_sum_prod.cuh
index 969ed30..be228cd 100644
--- a/src/main/cpp/kernels/cum_sum_prod.cuh
+++ b/src/main/cuda/headers/cum_sum_prod.cuh
@@ -17,11 +17,11 @@
* under the License.
*/
-#ifndef __CUM_SUM_PROD_H
-#define __CUM_SUM_PROD_H
-
#pragma once
+#ifndef CUM_SUM_PROD_H
+#define CUM_SUM_PROD_H
+using uint = unsigned int;
#include <cuda_runtime.h>
/**
@@ -145,4 +145,4 @@ extern "C" __global__ void cumulative_sum_prod_f(float *g_idata, float *g_odata,
cumulative_sum_prod<float, float2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
}
-#endif // __CUM_SUM_PROD_H
+#endif // CUM_SUM_PROD_H
diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh
new file mode 100644
index 0000000..56845b5
--- /dev/null
+++ b/src/main/cuda/headers/reduction.cuh
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef REDUCTION_CUH
+#define REDUCTION_CUH
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
+
+#include "utils.cuh"
+
+/**
+ * Does a reduce operation over all elements of the array.
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA
+ * Samples (v8.0)
+ * and the Reduction example available through jcuda.org
+ * When invoked initially, all blocks partly compute the reduction operation
+ * over the entire array
+ * and write it to the output/temporary array. A second invocation needs to
+ * happen to get the
+ * reduced value.
+ * The number of threads, blocks and amount of shared memory is calculated in a
+ * specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemDS code that invokes this
+ * method to see
+ * how it's done.
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ *
+ * @param n size of the input and temporary/output arrays
+ * @param ReductionOp Type of the functor object that implements the
+ * reduction operation
+ * @param SpoofCellwiseOp initial value for the reduction variable
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void FULL_AGG(
+ T *g_idata, ///< input data stored in device memory (of size n)
+ T *g_odata, ///< output/temporary array stored in device memory (of size n)
+ uint m,
+ uint n,
+ T initialValue,
+ ReductionOp reduction_op,
+ SpoofCellwiseOp spoof_op)
+{
+ auto sdata = shared_memory_proxy<T>();
+
+ // perform first level of reduction,
+ // reading from global memory, writing to shared memory
+ uint tid = threadIdx.x;
+ uint i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+ uint gridSize = blockDim.x * 2 * gridDim.x;
+ uint N = m * n;
+ T v = initialValue;
+
+ // we reduce multiple elements per thread. The number is determined by the
+ // number of active thread blocks (via gridDim). More blocks will result
+ // in a larger gridSize and therefore fewer elements per thread
+ while (i < N) {
+ v = reduction_op(v, spoof_op(g_idata[i], i));
+
+ if (i + blockDim.x < N)
+ {
+ //__syncthreads();
+ //printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
+ v = reduction_op(v, spoof_op(g_idata[i + blockDim.x], blockDim.x + i));
+ }
+
+ i += gridSize;
+ }
+
+ // each thread puts its local sum into shared memory
+ sdata[tid] = v;
+ __syncthreads();
+
+ // do reduction in shared mem
+ if (blockDim.x >= 1024) {
+ if (tid < 512) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 512) {
+ if (tid < 256) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 256) {
+ if (tid < 128) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 128) {
+ if (tid < 64) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+ }
+ __syncthreads();
+ }
+
+ if (tid < 32) {
+ // now that we are using warp-synchronous programming (below)
+ // we need to declare our shared memory volatile so that the compiler
+ // doesn't reorder stores to it and induce incorrect behavior.
+ volatile T *smem = sdata;
+ if (blockDim.x >= 64) {
+ smem[tid] = v = reduction_op(v, smem[tid + 32]);
+ }
+ if (blockDim.x >= 32) {
+ smem[tid] = v = reduction_op(v, smem[tid + 16]);
+ }
+ if (blockDim.x >= 16) {
+ smem[tid] = v = reduction_op(v, smem[tid + 8]);
+ }
+ if (blockDim.x >= 8) {
+ smem[tid] = v = reduction_op(v, smem[tid + 4]);
+ }
+ if (blockDim.x >= 4) {
+ smem[tid] = v = reduction_op(v, smem[tid + 2]);
+ }
+ if (blockDim.x >= 2) {
+ smem[tid] = v = reduction_op(v, smem[tid + 1]);
+ }
+ }
+
+ // write result for this block to global mem
+ if (tid == 0) {
+ if(gridDim.x < 10)
+ printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
+ g_odata[blockIdx.x] = sdata[0];
+ }
+}
+
+/**
+ * Does a reduce (sum) over each row of the array.
+ * This kernel must be launched with as many blocks as there are rows.
+ * The intuition for this kernel is that each block does a reduction over a
+ * single row.
+ * The maximum number of blocks that can launched (as of compute capability 3.0)
+ * is 2^31 - 1
+ * This works out fine for SystemDS, since the maximum elements in a Java array
+ * can be 2^31 - c (some small constant)
+ * If the matrix is "fat" and "short", i.e. there are small number of rows and a
+ * large number of columns,
+ * there could be under-utilization of the hardware.
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * row
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void ROW_AGG(
+ T *g_idata, ///< input data stored in device memory (of size rows*cols)
+ T *g_odata, ///< output/temporary array stored in device memory (of size
+ /// rows*cols)
+ uint rows, ///< rows in input and temporary/output arrays
+ uint cols, ///< columns in input and temporary/output arrays
+ T initialValue, ///< initial value for the reduction variable
+ ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
+ SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
+{
+ auto sdata = shared_memory_proxy<T>();
+
+ // one block per row
+ if (blockIdx.x >= rows) {
+ return;
+ }
+
+ uint block = blockIdx.x;
+ uint tid = threadIdx.x;
+ uint i = tid;
+ uint block_offset = block * cols;
+
+ T v = initialValue;
+ while (i < cols) {
+ v = reduction_op(v, spoof_op(g_idata[block_offset + i], i));
+ i += blockDim.x;
+ }
+
+ // each thread puts its local sum into shared memory
+ sdata[tid] = v;
+ __syncthreads();
+
+ // do reduction in shared mem
+ if (blockDim.x >= 1024) {
+ if (tid < 512) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 512) {
+ if (tid < 256) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 256) {
+ if (tid < 128) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 128) {
+ if (tid < 64) {
+ sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+ }
+ __syncthreads();
+ }
+
+ if (tid < 32) {
+ // now that we are using warp-synchronous programming (below)
+ // we need to declare our shared memory volatile so that the compiler
+ // doesn't reorder stores to it and induce incorrect behavior.
+ volatile T *smem = sdata;
+ if (blockDim.x >= 64) {
+ smem[tid] = v = reduction_op(v, smem[tid + 32]);
+ }
+ if (blockDim.x >= 32) {
+ smem[tid] = v = reduction_op(v, smem[tid + 16]);
+ }
+ if (blockDim.x >= 16) {
+ smem[tid] = v = reduction_op(v, smem[tid + 8]);
+ }
+ if (blockDim.x >= 8) {
+ smem[tid] = v = reduction_op(v, smem[tid + 4]);
+ }
+ if (blockDim.x >= 4) {
+ smem[tid] = v = reduction_op(v, smem[tid + 2]);
+ }
+ if (blockDim.x >= 2) {
+ smem[tid] = v = reduction_op(v, smem[tid + 1]);
+ }
+ }
+
+ // write result for this block to global mem, modify it with assignment op
+ if (tid == 0)
+ g_odata[block] = sdata[0];
+}
+
+/**
+ * Does a column wise reduction.
+ * The intuition is that there are as many global threads as there are columns
+ * Each global thread is responsible for a single element in the output vector
+ * This of course leads to a under-utilization of the GPU resources.
+ * For cases, where the number of columns is small, there can be unused SMs
+ *
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * column
+ */
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
+ T *g_odata, ///< output/temporary array stored in device memory (of size rows*cols)
+ uint rows, ///< rows in input and temporary/output arrays
+ uint cols, ///< columns in input and temporary/output arrays
+ T initialValue, ///< initial value for the reduction variable
+ ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
+ SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
+
+{
+ uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
+ if (global_tid >= cols) {
+ return;
+ }
+
+ uint i = global_tid;
+ uint grid_size = cols;
+ T val = initialValue;
+
+ while (i < rows * cols) {
+ val = reduction_op(val, spoof_op(g_idata[i], i));
+ i += grid_size;
+ }
+ g_odata[global_tid] = val;
+}
+
+template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
+__device__ void NO_AGG(T* g_idata, T* g_odata, uint rows, uint cols,
+ T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op)
+{
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int first_idx = tid * static_cast<int>(VT);
+ int last_idx = min(first_idx + static_cast<int>(VT), spoof_op.m * spoof_op.n);
+ #pragma unroll
+ for(int i = first_idx; i < last_idx; i++) {
+ g_odata[i] = spoof_op(g_idata[i], i);
+ }
+}
+
+#endif // REDUCTION_CUH
diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh
new file mode 100644
index 0000000..e28d254
--- /dev/null
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef SPOOF_UTILS_CUH
+#define SPOOF_UTILS_CUH
+
+#include <math_constants.h>
+
+__constant__ double DOUBLE_EPS = 1.11022E-16; // 2 ^ -53
+__constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26
+__constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it
+
+__device__ long long toInt64(double a) {
+ return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS));
+}
+
+__device__ int toInt32(float a) {
+ return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS));
+}
+
+template<typename T>
+__device__ T getValue(T* data, int rowIndex) {
+ return data[rowIndex];
+}
+
+template<typename T>
+__device__ T getValue(T* data, int n, int rowIndex, int colIndex) {
+ return data[rowIndex * n + colIndex];
+}
+
+template<typename T>
+__device__ T intDiv(T a, T b);
+
+template<>
+__device__ double intDiv(double a, double b) {
+ double ret = a / b;
+ return (isnan(ret) || isinf(ret)) ? ret : toInt64(ret);
+}
+
+template<>
+__device__ float intDiv(float a, float b) {
+ float ret = a / b;
+ return (isnan(ret) || isinf(ret)) ? ret : toInt32(ret);
+}
+
+template<typename T>
+__device__ T modulus(T a, T b);
+
+template<>
+__device__ double modulus(double a, double b) {
+ if (fabs(b) < DOUBLE_EPS)
+ return CUDART_NAN;
+ return a - intDiv(a, b) * b;
+}
+
+template<>
+__device__ float modulus(float a, float b) {
+ if (fabs(b) < FLOAT_EPS)
+ return CUDART_NAN_F;
+ return a - intDiv(a, b) * b;
+}
+
+template<typename T>
+__device__ T bwAnd(T a, T b);
+
+// ToDo: does not work with long long
+template<>
+__device__ double bwAnd(double a, double b) {
+ return toInt64(a) & toInt64(b);
+}
+
+template<>
+__device__ float bwAnd(float a, float b) {
+ return toInt32(a) & toInt32(b);
+}
+
+#endif // SPOOF_UTILS_CUH
diff --git a/src/main/cpp/kernels/utils.cuh b/src/main/cuda/headers/utils.cuh
similarity index 87%
rename from src/main/cpp/kernels/utils.cuh
rename to src/main/cuda/headers/utils.cuh
index e4ec01d..420c0d0 100644
--- a/src/main/cpp/kernels/utils.cuh
+++ b/src/main/cuda/headers/utils.cuh
@@ -17,12 +17,25 @@
* under the License.
*/
-#ifndef __UTILS_H
-#define __UTILS_H
-
#pragma once
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <limits>
-#include <cuda_runtime.h>
+// Use this method in templates to fetch the maximum value for a given datatype
+template<typename T>
+__forceinline__ __device__ T MAX() {
+ return T();
+}
+template<>
+__forceinline__ __device__ float MAX<float>() {
+ return std::numeric_limits<float>::max();
+}
+template<>
+__forceinline__ __device__ double MAX<double>() {
+ return std::numeric_limits<double>::max();
+}
/**
* Solution suggested by [1] to have different types of shared memory
@@ -109,4 +122,4 @@ extern "C" __global__ void float2double_f(float *A, double *ret, int N) {
}
}
-#endif // __UTILS_H
+#endif // UTILS_H
diff --git a/src/main/cpp/kernels/SystemDS.cu b/src/main/cuda/kernels/SystemDS.cu
similarity index 99%
rename from src/main/cpp/kernels/SystemDS.cu
rename to src/main/cuda/kernels/SystemDS.cu
index ccf880b..52e2b33 100644
--- a/src/main/cpp/kernels/SystemDS.cu
+++ b/src/main/cuda/kernels/SystemDS.cu
@@ -23,11 +23,8 @@
nvcc -w -ptx -arch=sm_30 --std c++11 SystemDS.cu
***********************************/
-#include <cfloat>
-#include <cmath>
using uint = unsigned int;
#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
#include "utils.cuh"
#include "agg_ops.cuh"
@@ -424,20 +421,6 @@ extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) {
copy_u2l_dense(ret, dim, N);
}
-// Use this method in templates to fetch the maximum value for a given datatype
-template<typename T>
-__forceinline__ __device__ T MAX() {
- return T();
-}
-template<>
-__forceinline__ __device__ float MAX<float>() {
- return FLT_MAX;
-}
-template<>
-__forceinline__ __device__ double MAX<double>() {
- return DBL_MAX;
-}
-
// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
// 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz,
diff --git a/src/main/cpp/kernels/SystemDS.ptx b/src/main/cuda/kernels/SystemDS.ptx
similarity index 100%
rename from src/main/cpp/kernels/SystemDS.ptx
rename to src/main/cuda/kernels/SystemDS.ptx
diff --git a/src/main/cuda/kernels/reduction.cu b/src/main/cuda/kernels/reduction.cu
new file mode 100644
index 0000000..04fd098
--- /dev/null
+++ b/src/main/cuda/kernels/reduction.cu
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "utils.cuh"
+#include "agg_ops.cuh"
+#include "reduction.cuh"
+
+using uint = unsigned int;
+#include <cuda_runtime.h>
+
+/**
+ * Do a summation over all elements of an array/matrix
+ * @param g_idata input data stored in device memory (of size n)
+ * @param g_odata output/temporary array stored in device memory (of size n)
+ * @param n size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_sum(T *g_idata, T *g_odata, uint n) {
+ SumOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ FULL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, uint n) {
+ reduce_sum(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) {
+ reduce_sum(g_idata, g_odata, n);
+}
+
+/**
+ * Do a summation over all rows of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size rows)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
+ SumOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a summation over all columns of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size cols)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
+ SumOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ COL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_col_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_col_sum(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a max over all elements of an array/matrix
+ * @param g_idata input data stored in device memory (of size n)
+ * @param g_odata output/temporary array stored in device memory (of size n)
+ * @param n size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_max(T *g_idata, T *g_odata, uint n) {
+ MaxOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ FULL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, uint n) {
+ reduce_max(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) {
+ reduce_max(g_idata, g_odata, n);
+}
+
+/**
+ * Do a max over all rows of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size rows)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) {
+ MaxOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ ROW_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a max over all columns of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size cols)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) {
+ MaxOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ COL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a min over all elements of an array/matrix
+ * @param g_idata input data stored in device memory (of size n)
+ * @param g_odata output/temporary array stored in device memory (of size n)
+ * @param n size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_min(T *g_idata, T *g_odata, uint n) {
+ MinOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ FULL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, uint n) {
+ reduce_min(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) {
+ reduce_min(g_idata, g_odata, n);
+}
+
+
+/**
+ * Do a min over all rows of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size rows)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) {
+ MinOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ ROW_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+/**
+ * Do a min over all columns of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size cols)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) {
+ MinOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ COL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
+ reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
+ reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+
+/**
+ * Do a summation over all squared elements of an array/matrix
+ * @param g_idata input data stored in device memory (of size n)
+ * @param g_odata output/temporary array stored in device memory (of size n)
+ * @param n size of the input and temporary/output arrays
+ */
+template<typename T>
+__device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) {
+ SumSqOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ FULL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) {
+ reduce_sum_sq(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) {
+ reduce_sum_sq(g_idata, g_odata, n);
+}
+
+/**
+ * Do a column-wise summation over the squared elements of a matrix
+ * @param g_idata input matrix stored in device memory (of size rows * cols)
+ * @param g_odata output vector stored in device memory (of size cols)
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+template<typename T>
+__device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
+ SumSqOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ COL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
+ reduce_col_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
+ reduce_col_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+template<typename T>
+__device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
+ SumSqOp<T> agg_op;
+ IdentityOp<T> spoof_op;
+ ROW_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op);
+}
+
+extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
+ reduce_row_sum_sq(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
+ reduce_row_sum_sq(g_idata, g_odata, rows, cols);
+}
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
new file mode 100644
index 0000000..4a30447
--- /dev/null
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -0,0 +1,3546 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-27506705
+// Cuda compilation tools, release 10.2, V10.2.89
+// Based on LLVM 3.4svn
+//
+
+.version 6.5
+.target sm_30
+.address_size 64
+
+ // .globl double2float_f
+.extern .func (.param .b32 func_retval0) vprintf
+(
+ .param .b64 vprintf_param_0,
+ .param .b64 vprintf_param_1
+)
+;
+.global .align 1 .b8 $str[39] = {98, 108, 111, 99, 107, 73, 100, 120, 46, 120, 61, 37, 100, 32, 114, 101, 100, 117, 99, 116, 105, 111, 110, 32, 114, 101, 115, 117, 108, 116, 58, 32, 37, 51, 46, 49, 102, 10, 0};
+.extern .shared .align 1 .b8 memory[];
+
+.visible .entry double2float_f(
+ .param .u64 double2float_f_param_0,
+ .param .u64 double2float_f_param_1,
+ .param .u32 double2float_f_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .f32 %f<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [double2float_f_param_0];
+ ld.param.u64 %rd2, [double2float_f_param_1];
+ ld.param.u32 %r2, [double2float_f_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.s32 %p1, %r1, %r2;
+ @%p1 bra BB0_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ cvt.rn.f32.f64 %f1, %fd1;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.s32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f1;
+
+BB0_2:
+ ret;
+}
+
+ // .globl float2double_f
+.visible .entry float2double_f(
+ .param .u64 float2double_f_param_0,
+ .param .u64 float2double_f_param_1,
+ .param .u32 float2double_f_param_2
+)
+{
+ .reg .pred %p<2>;
+ .reg .f32 %f<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [float2double_f_param_0];
+ ld.param.u64 %rd2, [float2double_f_param_1];
+ ld.param.u32 %r2, [float2double_f_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5;
+ setp.ge.s32 %p1, %r1, %r2;
+ @%p1 bra BB1_2;
+
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f1, [%rd5];
+ cvt.f64.f32 %fd1, %f1;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.s32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd1;
+
+BB1_2:
+ ret;
+}
+
+ // .globl reduce_sum_d
+.visible .entry reduce_sum_d(
+ .param .u64 reduce_sum_d_param_0,
+ .param .u64 reduce_sum_d_param_1,
+ .param .u32 reduce_sum_d_param_2
+)
+{
+ .local .align 16 .b8 __local_depot2[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<61>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot2;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_sum_d_param_0];
+ ld.param.u64 %rd2, [reduce_sum_d_param_1];
+ ld.param.u32 %r6, [reduce_sum_d_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f64 %fd45, 0d0000000000000000;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB2_4;
+
+BB2_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd30, [%rd5];
+ add.f64 %fd45, %fd45, %fd30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB2_3;
+
+ mul.wide.u32 %rd7, %r3, 8;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f64 %fd31, [%rd8];
+ add.f64 %fd45, %fd45, %fd31;
+
+BB2_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB2_1;
+
+BB2_4:
+ shl.b32 %r16, %r10, 3;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f64 [%r5], %fd45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB2_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB2_7;
+
+ ld.shared.f64 %fd32, [%r5+4096];
+ add.f64 %fd45, %fd45, %fd32;
+ st.shared.f64 [%r5], %fd45;
+
+BB2_7:
+ bar.sync 0;
+
+BB2_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB2_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB2_11;
+
+ ld.shared.f64 %fd33, [%r5+2048];
+ add.f64 %fd45, %fd45, %fd33;
+ st.shared.f64 [%r5], %fd45;
+
+BB2_11:
+ bar.sync 0;
+
+BB2_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB2_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB2_15;
+
+ ld.shared.f64 %fd34, [%r5+1024];
+ add.f64 %fd45, %fd45, %fd34;
+ st.shared.f64 [%r5], %fd45;
+
+BB2_15:
+ bar.sync 0;
+
+BB2_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB2_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB2_19;
+
+ ld.shared.f64 %fd35, [%r5+512];
+ add.f64 %fd45, %fd45, %fd35;
+ st.shared.f64 [%r5], %fd45;
+
+BB2_19:
+ bar.sync 0;
+
+BB2_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB2_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB2_23;
+
+ ld.volatile.shared.f64 %fd36, [%r5+256];
+ add.f64 %fd45, %fd45, %fd36;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB2_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB2_25;
+
+ ld.volatile.shared.f64 %fd37, [%r5+128];
+ add.f64 %fd45, %fd45, %fd37;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB2_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB2_27;
+
+ ld.volatile.shared.f64 %fd38, [%r5+64];
+ add.f64 %fd45, %fd45, %fd38;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB2_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB2_29;
+
+ ld.volatile.shared.f64 %fd39, [%r5+32];
+ add.f64 %fd45, %fd45, %fd39;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB2_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB2_31;
+
+ ld.volatile.shared.f64 %fd40, [%r5+16];
+ add.f64 %fd45, %fd45, %fd40;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB2_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB2_33;
+
+ ld.volatile.shared.f64 %fd41, [%r5+8];
+ add.f64 %fd42, %fd45, %fd41;
+ st.volatile.shared.f64 [%r5], %fd42;
+
+BB2_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB2_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB2_36;
+
+ ld.shared.f64 %fd43, [memory];
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd43;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 0
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 0
+
+BB2_36:
+ ld.shared.f64 %fd44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 8;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd44;
+
+BB2_37:
+ ret;
+}
+
+ // .globl reduce_sum_f
+.visible .entry reduce_sum_f(
+ .param .u64 reduce_sum_f_param_0,
+ .param .u64 reduce_sum_f_param_1,
+ .param .u32 reduce_sum_f_param_2
+)
+{
+ .local .align 16 .b8 __local_depot3[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .f32 %f<61>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot3;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_sum_f_param_0];
+ ld.param.u64 %rd2, [reduce_sum_f_param_1];
+ ld.param.u32 %r6, [reduce_sum_f_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f32 %f45, 0f00000000;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB3_4;
+
+BB3_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f30, [%rd5];
+ add.f32 %f45, %f45, %f30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB3_3;
+
+ mul.wide.u32 %rd7, %r3, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f32 %f31, [%rd8];
+ add.f32 %f45, %f45, %f31;
+
+BB3_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB3_1;
+
+BB3_4:
+ shl.b32 %r16, %r10, 2;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f32 [%r5], %f45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB3_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB3_7;
+
+ ld.shared.f32 %f32, [%r5+2048];
+ add.f32 %f45, %f45, %f32;
+ st.shared.f32 [%r5], %f45;
+
+BB3_7:
+ bar.sync 0;
+
+BB3_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB3_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB3_11;
+
+ ld.shared.f32 %f33, [%r5+1024];
+ add.f32 %f45, %f45, %f33;
+ st.shared.f32 [%r5], %f45;
+
+BB3_11:
+ bar.sync 0;
+
+BB3_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB3_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB3_15;
+
+ ld.shared.f32 %f34, [%r5+512];
+ add.f32 %f45, %f45, %f34;
+ st.shared.f32 [%r5], %f45;
+
+BB3_15:
+ bar.sync 0;
+
+BB3_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB3_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB3_19;
+
+ ld.shared.f32 %f35, [%r5+256];
+ add.f32 %f45, %f45, %f35;
+ st.shared.f32 [%r5], %f45;
+
+BB3_19:
+ bar.sync 0;
+
+BB3_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB3_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB3_23;
+
+ ld.volatile.shared.f32 %f36, [%r5+128];
+ add.f32 %f45, %f45, %f36;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB3_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB3_25;
+
+ ld.volatile.shared.f32 %f37, [%r5+64];
+ add.f32 %f45, %f45, %f37;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB3_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB3_27;
+
+ ld.volatile.shared.f32 %f38, [%r5+32];
+ add.f32 %f45, %f45, %f38;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB3_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB3_29;
+
+ ld.volatile.shared.f32 %f39, [%r5+16];
+ add.f32 %f45, %f45, %f39;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB3_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB3_31;
+
+ ld.volatile.shared.f32 %f40, [%r5+8];
+ add.f32 %f45, %f45, %f40;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB3_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB3_33;
+
+ ld.volatile.shared.f32 %f41, [%r5+4];
+ add.f32 %f42, %f45, %f41;
+ st.volatile.shared.f32 [%r5], %f42;
+
+BB3_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB3_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB3_36;
+
+ ld.shared.f32 %f43, [memory];
+ cvt.f64.f32 %fd1, %f43;
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd1;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 1
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 1
+
+BB3_36:
+ ld.shared.f32 %f44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f32 [%rd15], %f44;
+
+BB3_37:
+ ret;
+}
+
+ // .globl reduce_row_sum_d
+.visible .entry reduce_row_sum_d(
+ .param .u64 reduce_row_sum_d_param_0,
+ .param .u64 reduce_row_sum_d_param_1,
+ .param .u32 reduce_row_sum_d_param_2,
+ .param .u32 reduce_row_sum_d_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .b32 %r<72>;
+ .reg .f64 %fd<56>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_sum_d_param_0];
+ ld.param.u64 %rd2, [reduce_row_sum_d_param_1];
+ ld.param.u32 %r5, [reduce_row_sum_d_param_2];
+ ld.param.u32 %r4, [reduce_row_sum_d_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB4_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f64 %fd6, 0d0000000000000000;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB4_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB4_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd28, [%rd5];
+ add.f64 %fd6, %fd6, %fd28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB4_3;
+
+BB4_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 3;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f64 [%r13], %fd6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB4_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB4_7;
+
+ ld.shared.f64 %fd29, [%r13+4096];
+ add.f64 %fd6, %fd6, %fd29;
+ st.shared.f64 [%r13], %fd6;
+
+BB4_7:
+ bar.sync 0;
+
+BB4_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB4_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB4_11;
+
+ ld.shared.f64 %fd30, [%r13+2048];
+ add.f64 %fd6, %fd6, %fd30;
+ st.shared.f64 [%r13], %fd6;
+
+BB4_11:
+ bar.sync 0;
+
+BB4_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB4_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB4_15;
+
+ ld.shared.f64 %fd31, [%r13+1024];
+ add.f64 %fd6, %fd6, %fd31;
+ st.shared.f64 [%r13], %fd6;
+
+BB4_15:
+ bar.sync 0;
+
+BB4_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB4_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB4_19;
+
+ ld.shared.f64 %fd32, [%r13+512];
+ add.f64 %fd6, %fd6, %fd32;
+ st.shared.f64 [%r13], %fd6;
+
+BB4_19:
+ bar.sync 0;
+
+BB4_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB4_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB4_23;
+
+ ld.volatile.shared.f64 %fd33, [%r13+256];
+ add.f64 %fd6, %fd6, %fd33;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB4_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB4_25;
+
+ ld.volatile.shared.f64 %fd34, [%r13+128];
+ add.f64 %fd6, %fd6, %fd34;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB4_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB4_27;
+
+ ld.volatile.shared.f64 %fd35, [%r13+64];
+ add.f64 %fd6, %fd6, %fd35;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB4_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB4_29;
+
+ ld.volatile.shared.f64 %fd36, [%r13+32];
+ add.f64 %fd6, %fd6, %fd36;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB4_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB4_31;
+
+ ld.volatile.shared.f64 %fd37, [%r13+16];
+ add.f64 %fd6, %fd6, %fd37;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB4_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB4_33;
+
+ ld.volatile.shared.f64 %fd38, [%r13+8];
+ add.f64 %fd39, %fd6, %fd38;
+ st.volatile.shared.f64 [%r13], %fd39;
+
+BB4_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB4_35;
+
+ ld.shared.f64 %fd40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd40;
+
+BB4_35:
+ ret;
+}
+
+ // .globl reduce_row_sum_f
+.visible .entry reduce_row_sum_f(
+ .param .u64 reduce_row_sum_f_param_0,
+ .param .u64 reduce_row_sum_f_param_1,
+ .param .u32 reduce_row_sum_f_param_2,
+ .param .u32 reduce_row_sum_f_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .f32 %f<56>;
+ .reg .b32 %r<72>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_sum_f_param_0];
+ ld.param.u64 %rd2, [reduce_row_sum_f_param_1];
+ ld.param.u32 %r5, [reduce_row_sum_f_param_2];
+ ld.param.u32 %r4, [reduce_row_sum_f_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB5_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f32 %f6, 0f00000000;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB5_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB5_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f28, [%rd5];
+ add.f32 %f6, %f6, %f28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB5_3;
+
+BB5_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 2;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f32 [%r13], %f6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB5_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB5_7;
+
+ ld.shared.f32 %f29, [%r13+2048];
+ add.f32 %f6, %f6, %f29;
+ st.shared.f32 [%r13], %f6;
+
+BB5_7:
+ bar.sync 0;
+
+BB5_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB5_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB5_11;
+
+ ld.shared.f32 %f30, [%r13+1024];
+ add.f32 %f6, %f6, %f30;
+ st.shared.f32 [%r13], %f6;
+
+BB5_11:
+ bar.sync 0;
+
+BB5_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB5_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB5_15;
+
+ ld.shared.f32 %f31, [%r13+512];
+ add.f32 %f6, %f6, %f31;
+ st.shared.f32 [%r13], %f6;
+
+BB5_15:
+ bar.sync 0;
+
+BB5_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB5_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB5_19;
+
+ ld.shared.f32 %f32, [%r13+256];
+ add.f32 %f6, %f6, %f32;
+ st.shared.f32 [%r13], %f6;
+
+BB5_19:
+ bar.sync 0;
+
+BB5_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB5_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB5_23;
+
+ ld.volatile.shared.f32 %f33, [%r13+128];
+ add.f32 %f6, %f6, %f33;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB5_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB5_25;
+
+ ld.volatile.shared.f32 %f34, [%r13+64];
+ add.f32 %f6, %f6, %f34;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB5_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB5_27;
+
+ ld.volatile.shared.f32 %f35, [%r13+32];
+ add.f32 %f6, %f6, %f35;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB5_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB5_29;
+
+ ld.volatile.shared.f32 %f36, [%r13+16];
+ add.f32 %f6, %f6, %f36;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB5_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB5_31;
+
+ ld.volatile.shared.f32 %f37, [%r13+8];
+ add.f32 %f6, %f6, %f37;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB5_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB5_33;
+
+ ld.volatile.shared.f32 %f38, [%r13+4];
+ add.f32 %f39, %f6, %f38;
+ st.volatile.shared.f32 [%r13], %f39;
+
+BB5_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB5_35;
+
+ ld.shared.f32 %f40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f40;
+
+BB5_35:
+ ret;
+}
+
+ // .globl reduce_col_sum_d
+.visible .entry reduce_col_sum_d(
+ .param .u64 reduce_col_sum_d_param_0,
+ .param .u64 reduce_col_sum_d_param_1,
+ .param .u32 reduce_col_sum_d_param_2,
+ .param .u32 reduce_col_sum_d_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<9>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_sum_d_param_0];
+ ld.param.u64 %rd3, [reduce_col_sum_d_param_1];
+ ld.param.u32 %r5, [reduce_col_sum_d_param_2];
+ ld.param.u32 %r6, [reduce_col_sum_d_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB6_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f64 %fd8, 0d0000000000000000;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB6_4;
+
+ mov.u32 %r10, %r1;
+
+BB6_3:
+ mul.wide.u32 %rd4, %r10, 8;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f64 %fd6, [%rd5];
+ add.f64 %fd8, %fd8, %fd6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB6_3;
+
+BB6_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8;
+
+BB6_5:
+ ret;
+}
+
+ // .globl reduce_col_sum_f
+.visible .entry reduce_col_sum_f(
+ .param .u64 reduce_col_sum_f_param_0,
+ .param .u64 reduce_col_sum_f_param_1,
+ .param .u32 reduce_col_sum_f_param_2,
+ .param .u32 reduce_col_sum_f_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<11>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_sum_f_param_0];
+ ld.param.u64 %rd3, [reduce_col_sum_f_param_1];
+ ld.param.u32 %r5, [reduce_col_sum_f_param_2];
+ ld.param.u32 %r6, [reduce_col_sum_f_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB7_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f32 %f8, 0f00000000;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB7_4;
+
+ mov.u32 %r10, %r1;
+
+BB7_3:
+ mul.wide.u32 %rd4, %r10, 4;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f32 %f6, [%rd5];
+ add.f32 %f8, %f8, %f6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB7_3;
+
+BB7_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f8;
+
+BB7_5:
+ ret;
+}
+
+ // .globl reduce_max_d
+.visible .entry reduce_max_d(
+ .param .u64 reduce_max_d_param_0,
+ .param .u64 reduce_max_d_param_1,
+ .param .u32 reduce_max_d_param_2
+)
+{
+ .local .align 16 .b8 __local_depot8[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<61>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot8;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_max_d_param_0];
+ ld.param.u64 %rd2, [reduce_max_d_param_1];
+ ld.param.u32 %r6, [reduce_max_d_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f64 %fd45, 0dFFEFFFFFFFFFFFFF;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB8_4;
+
+BB8_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd30, [%rd5];
+ max.f64 %fd45, %fd45, %fd30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB8_3;
+
+ mul.wide.u32 %rd7, %r3, 8;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f64 %fd31, [%rd8];
+ max.f64 %fd45, %fd45, %fd31;
+
+BB8_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB8_1;
+
+BB8_4:
+ shl.b32 %r16, %r10, 3;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f64 [%r5], %fd45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB8_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB8_7;
+
+ ld.shared.f64 %fd32, [%r5+4096];
+ max.f64 %fd45, %fd45, %fd32;
+ st.shared.f64 [%r5], %fd45;
+
+BB8_7:
+ bar.sync 0;
+
+BB8_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB8_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB8_11;
+
+ ld.shared.f64 %fd33, [%r5+2048];
+ max.f64 %fd45, %fd45, %fd33;
+ st.shared.f64 [%r5], %fd45;
+
+BB8_11:
+ bar.sync 0;
+
+BB8_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB8_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB8_15;
+
+ ld.shared.f64 %fd34, [%r5+1024];
+ max.f64 %fd45, %fd45, %fd34;
+ st.shared.f64 [%r5], %fd45;
+
+BB8_15:
+ bar.sync 0;
+
+BB8_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB8_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB8_19;
+
+ ld.shared.f64 %fd35, [%r5+512];
+ max.f64 %fd45, %fd45, %fd35;
+ st.shared.f64 [%r5], %fd45;
+
+BB8_19:
+ bar.sync 0;
+
+BB8_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB8_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB8_23;
+
+ ld.volatile.shared.f64 %fd36, [%r5+256];
+ max.f64 %fd45, %fd45, %fd36;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB8_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB8_25;
+
+ ld.volatile.shared.f64 %fd37, [%r5+128];
+ max.f64 %fd45, %fd45, %fd37;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB8_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB8_27;
+
+ ld.volatile.shared.f64 %fd38, [%r5+64];
+ max.f64 %fd45, %fd45, %fd38;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB8_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB8_29;
+
+ ld.volatile.shared.f64 %fd39, [%r5+32];
+ max.f64 %fd45, %fd45, %fd39;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB8_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB8_31;
+
+ ld.volatile.shared.f64 %fd40, [%r5+16];
+ max.f64 %fd45, %fd45, %fd40;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB8_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB8_33;
+
+ ld.volatile.shared.f64 %fd41, [%r5+8];
+ max.f64 %fd42, %fd45, %fd41;
+ st.volatile.shared.f64 [%r5], %fd42;
+
+BB8_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB8_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB8_36;
+
+ ld.shared.f64 %fd43, [memory];
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd43;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 2
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 2
+
+BB8_36:
+ ld.shared.f64 %fd44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 8;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd44;
+
+BB8_37:
+ ret;
+}
+
+ // .globl reduce_max_f
+.visible .entry reduce_max_f(
+ .param .u64 reduce_max_f_param_0,
+ .param .u64 reduce_max_f_param_1,
+ .param .u32 reduce_max_f_param_2
+)
+{
+ .local .align 16 .b8 __local_depot9[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .f32 %f<61>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot9;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_max_f_param_0];
+ ld.param.u64 %rd2, [reduce_max_f_param_1];
+ ld.param.u32 %r6, [reduce_max_f_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f32 %f45, 0fFF7FFFFF;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB9_4;
+
+BB9_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f30, [%rd5];
+ max.f32 %f45, %f45, %f30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB9_3;
+
+ mul.wide.u32 %rd7, %r3, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f32 %f31, [%rd8];
+ max.f32 %f45, %f45, %f31;
+
+BB9_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB9_1;
+
+BB9_4:
+ shl.b32 %r16, %r10, 2;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f32 [%r5], %f45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB9_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB9_7;
+
+ ld.shared.f32 %f32, [%r5+2048];
+ max.f32 %f45, %f45, %f32;
+ st.shared.f32 [%r5], %f45;
+
+BB9_7:
+ bar.sync 0;
+
+BB9_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB9_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB9_11;
+
+ ld.shared.f32 %f33, [%r5+1024];
+ max.f32 %f45, %f45, %f33;
+ st.shared.f32 [%r5], %f45;
+
+BB9_11:
+ bar.sync 0;
+
+BB9_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB9_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB9_15;
+
+ ld.shared.f32 %f34, [%r5+512];
+ max.f32 %f45, %f45, %f34;
+ st.shared.f32 [%r5], %f45;
+
+BB9_15:
+ bar.sync 0;
+
+BB9_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB9_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB9_19;
+
+ ld.shared.f32 %f35, [%r5+256];
+ max.f32 %f45, %f45, %f35;
+ st.shared.f32 [%r5], %f45;
+
+BB9_19:
+ bar.sync 0;
+
+BB9_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB9_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB9_23;
+
+ ld.volatile.shared.f32 %f36, [%r5+128];
+ max.f32 %f45, %f45, %f36;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB9_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB9_25;
+
+ ld.volatile.shared.f32 %f37, [%r5+64];
+ max.f32 %f45, %f45, %f37;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB9_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB9_27;
+
+ ld.volatile.shared.f32 %f38, [%r5+32];
+ max.f32 %f45, %f45, %f38;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB9_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB9_29;
+
+ ld.volatile.shared.f32 %f39, [%r5+16];
+ max.f32 %f45, %f45, %f39;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB9_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB9_31;
+
+ ld.volatile.shared.f32 %f40, [%r5+8];
+ max.f32 %f45, %f45, %f40;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB9_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB9_33;
+
+ ld.volatile.shared.f32 %f41, [%r5+4];
+ max.f32 %f42, %f45, %f41;
+ st.volatile.shared.f32 [%r5], %f42;
+
+BB9_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB9_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB9_36;
+
+ ld.shared.f32 %f43, [memory];
+ cvt.f64.f32 %fd1, %f43;
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd1;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 3
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 3
+
+BB9_36:
+ ld.shared.f32 %f44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f32 [%rd15], %f44;
+
+BB9_37:
+ ret;
+}
+
+ // .globl reduce_row_max_d
+.visible .entry reduce_row_max_d(
+ .param .u64 reduce_row_max_d_param_0,
+ .param .u64 reduce_row_max_d_param_1,
+ .param .u32 reduce_row_max_d_param_2,
+ .param .u32 reduce_row_max_d_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .b32 %r<72>;
+ .reg .f64 %fd<56>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_max_d_param_0];
+ ld.param.u64 %rd2, [reduce_row_max_d_param_1];
+ ld.param.u32 %r5, [reduce_row_max_d_param_2];
+ ld.param.u32 %r4, [reduce_row_max_d_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB10_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f64 %fd6, 0dFFEFFFFFFFFFFFFF;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB10_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB10_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd28, [%rd5];
+ max.f64 %fd6, %fd6, %fd28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB10_3;
+
+BB10_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 3;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f64 [%r13], %fd6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB10_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB10_7;
+
+ ld.shared.f64 %fd29, [%r13+4096];
+ max.f64 %fd6, %fd6, %fd29;
+ st.shared.f64 [%r13], %fd6;
+
+BB10_7:
+ bar.sync 0;
+
+BB10_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB10_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB10_11;
+
+ ld.shared.f64 %fd30, [%r13+2048];
+ max.f64 %fd6, %fd6, %fd30;
+ st.shared.f64 [%r13], %fd6;
+
+BB10_11:
+ bar.sync 0;
+
+BB10_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB10_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB10_15;
+
+ ld.shared.f64 %fd31, [%r13+1024];
+ max.f64 %fd6, %fd6, %fd31;
+ st.shared.f64 [%r13], %fd6;
+
+BB10_15:
+ bar.sync 0;
+
+BB10_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB10_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB10_19;
+
+ ld.shared.f64 %fd32, [%r13+512];
+ max.f64 %fd6, %fd6, %fd32;
+ st.shared.f64 [%r13], %fd6;
+
+BB10_19:
+ bar.sync 0;
+
+BB10_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB10_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB10_23;
+
+ ld.volatile.shared.f64 %fd33, [%r13+256];
+ max.f64 %fd6, %fd6, %fd33;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB10_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB10_25;
+
+ ld.volatile.shared.f64 %fd34, [%r13+128];
+ max.f64 %fd6, %fd6, %fd34;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB10_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB10_27;
+
+ ld.volatile.shared.f64 %fd35, [%r13+64];
+ max.f64 %fd6, %fd6, %fd35;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB10_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB10_29;
+
+ ld.volatile.shared.f64 %fd36, [%r13+32];
+ max.f64 %fd6, %fd6, %fd36;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB10_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB10_31;
+
+ ld.volatile.shared.f64 %fd37, [%r13+16];
+ max.f64 %fd6, %fd6, %fd37;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB10_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB10_33;
+
+ ld.volatile.shared.f64 %fd38, [%r13+8];
+ max.f64 %fd39, %fd6, %fd38;
+ st.volatile.shared.f64 [%r13], %fd39;
+
+BB10_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB10_35;
+
+ ld.shared.f64 %fd40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd40;
+
+BB10_35:
+ ret;
+}
+
+ // .globl reduce_row_max_f
+.visible .entry reduce_row_max_f(
+ .param .u64 reduce_row_max_f_param_0,
+ .param .u64 reduce_row_max_f_param_1,
+ .param .u32 reduce_row_max_f_param_2,
+ .param .u32 reduce_row_max_f_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .f32 %f<56>;
+ .reg .b32 %r<72>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_max_f_param_0];
+ ld.param.u64 %rd2, [reduce_row_max_f_param_1];
+ ld.param.u32 %r5, [reduce_row_max_f_param_2];
+ ld.param.u32 %r4, [reduce_row_max_f_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB11_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f32 %f6, 0fFF7FFFFF;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB11_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB11_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f28, [%rd5];
+ max.f32 %f6, %f6, %f28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB11_3;
+
+BB11_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 2;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f32 [%r13], %f6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB11_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB11_7;
+
+ ld.shared.f32 %f29, [%r13+2048];
+ max.f32 %f6, %f6, %f29;
+ st.shared.f32 [%r13], %f6;
+
+BB11_7:
+ bar.sync 0;
+
+BB11_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB11_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB11_11;
+
+ ld.shared.f32 %f30, [%r13+1024];
+ max.f32 %f6, %f6, %f30;
+ st.shared.f32 [%r13], %f6;
+
+BB11_11:
+ bar.sync 0;
+
+BB11_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB11_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB11_15;
+
+ ld.shared.f32 %f31, [%r13+512];
+ max.f32 %f6, %f6, %f31;
+ st.shared.f32 [%r13], %f6;
+
+BB11_15:
+ bar.sync 0;
+
+BB11_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB11_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB11_19;
+
+ ld.shared.f32 %f32, [%r13+256];
+ max.f32 %f6, %f6, %f32;
+ st.shared.f32 [%r13], %f6;
+
+BB11_19:
+ bar.sync 0;
+
+BB11_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB11_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB11_23;
+
+ ld.volatile.shared.f32 %f33, [%r13+128];
+ max.f32 %f6, %f6, %f33;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB11_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB11_25;
+
+ ld.volatile.shared.f32 %f34, [%r13+64];
+ max.f32 %f6, %f6, %f34;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB11_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB11_27;
+
+ ld.volatile.shared.f32 %f35, [%r13+32];
+ max.f32 %f6, %f6, %f35;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB11_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB11_29;
+
+ ld.volatile.shared.f32 %f36, [%r13+16];
+ max.f32 %f6, %f6, %f36;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB11_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB11_31;
+
+ ld.volatile.shared.f32 %f37, [%r13+8];
+ max.f32 %f6, %f6, %f37;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB11_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB11_33;
+
+ ld.volatile.shared.f32 %f38, [%r13+4];
+ max.f32 %f39, %f6, %f38;
+ st.volatile.shared.f32 [%r13], %f39;
+
+BB11_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB11_35;
+
+ ld.shared.f32 %f40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f40;
+
+BB11_35:
+ ret;
+}
+
+ // .globl reduce_col_max_d
+.visible .entry reduce_col_max_d(
+ .param .u64 reduce_col_max_d_param_0,
+ .param .u64 reduce_col_max_d_param_1,
+ .param .u32 reduce_col_max_d_param_2,
+ .param .u32 reduce_col_max_d_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<9>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_max_d_param_0];
+ ld.param.u64 %rd3, [reduce_col_max_d_param_1];
+ ld.param.u32 %r5, [reduce_col_max_d_param_2];
+ ld.param.u32 %r6, [reduce_col_max_d_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB12_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB12_4;
+
+ mov.u32 %r10, %r1;
+
+BB12_3:
+ mul.wide.u32 %rd4, %r10, 8;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f64 %fd6, [%rd5];
+ max.f64 %fd8, %fd8, %fd6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB12_3;
+
+BB12_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8;
+
+BB12_5:
+ ret;
+}
+
+ // .globl reduce_col_max_f
+.visible .entry reduce_col_max_f(
+ .param .u64 reduce_col_max_f_param_0,
+ .param .u64 reduce_col_max_f_param_1,
+ .param .u32 reduce_col_max_f_param_2,
+ .param .u32 reduce_col_max_f_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<11>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_max_f_param_0];
+ ld.param.u64 %rd3, [reduce_col_max_f_param_1];
+ ld.param.u32 %r5, [reduce_col_max_f_param_2];
+ ld.param.u32 %r6, [reduce_col_max_f_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB13_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f32 %f8, 0fFF7FFFFF;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB13_4;
+
+ mov.u32 %r10, %r1;
+
+BB13_3:
+ mul.wide.u32 %rd4, %r10, 4;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f32 %f6, [%rd5];
+ max.f32 %f8, %f8, %f6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB13_3;
+
+BB13_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f8;
+
+BB13_5:
+ ret;
+}
+
+ // .globl reduce_min_d
+.visible .entry reduce_min_d(
+ .param .u64 reduce_min_d_param_0,
+ .param .u64 reduce_min_d_param_1,
+ .param .u32 reduce_min_d_param_2
+)
+{
+ .local .align 16 .b8 __local_depot14[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<61>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot14;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_min_d_param_0];
+ ld.param.u64 %rd2, [reduce_min_d_param_1];
+ ld.param.u32 %r6, [reduce_min_d_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f64 %fd45, 0d7FEFFFFFFFFFFFFF;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB14_4;
+
+BB14_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd30, [%rd5];
+ min.f64 %fd45, %fd45, %fd30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB14_3;
+
+ mul.wide.u32 %rd7, %r3, 8;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f64 %fd31, [%rd8];
+ min.f64 %fd45, %fd45, %fd31;
+
+BB14_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB14_1;
+
+BB14_4:
+ shl.b32 %r16, %r10, 3;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f64 [%r5], %fd45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB14_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB14_7;
+
+ ld.shared.f64 %fd32, [%r5+4096];
+ min.f64 %fd45, %fd45, %fd32;
+ st.shared.f64 [%r5], %fd45;
+
+BB14_7:
+ bar.sync 0;
+
+BB14_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB14_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB14_11;
+
+ ld.shared.f64 %fd33, [%r5+2048];
+ min.f64 %fd45, %fd45, %fd33;
+ st.shared.f64 [%r5], %fd45;
+
+BB14_11:
+ bar.sync 0;
+
+BB14_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB14_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB14_15;
+
+ ld.shared.f64 %fd34, [%r5+1024];
+ min.f64 %fd45, %fd45, %fd34;
+ st.shared.f64 [%r5], %fd45;
+
+BB14_15:
+ bar.sync 0;
+
+BB14_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB14_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB14_19;
+
+ ld.shared.f64 %fd35, [%r5+512];
+ min.f64 %fd45, %fd45, %fd35;
+ st.shared.f64 [%r5], %fd45;
+
+BB14_19:
+ bar.sync 0;
+
+BB14_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB14_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB14_23;
+
+ ld.volatile.shared.f64 %fd36, [%r5+256];
+ min.f64 %fd45, %fd45, %fd36;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB14_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB14_25;
+
+ ld.volatile.shared.f64 %fd37, [%r5+128];
+ min.f64 %fd45, %fd45, %fd37;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB14_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB14_27;
+
+ ld.volatile.shared.f64 %fd38, [%r5+64];
+ min.f64 %fd45, %fd45, %fd38;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB14_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB14_29;
+
+ ld.volatile.shared.f64 %fd39, [%r5+32];
+ min.f64 %fd45, %fd45, %fd39;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB14_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB14_31;
+
+ ld.volatile.shared.f64 %fd40, [%r5+16];
+ min.f64 %fd45, %fd45, %fd40;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB14_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB14_33;
+
+ ld.volatile.shared.f64 %fd41, [%r5+8];
+ min.f64 %fd42, %fd45, %fd41;
+ st.volatile.shared.f64 [%r5], %fd42;
+
+BB14_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB14_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB14_36;
+
+ ld.shared.f64 %fd43, [memory];
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd43;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 4
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 4
+
+BB14_36:
+ ld.shared.f64 %fd44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 8;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd44;
+
+BB14_37:
+ ret;
+}
+
+ // .globl reduce_min_f
+.visible .entry reduce_min_f(
+ .param .u64 reduce_min_f_param_0,
+ .param .u64 reduce_min_f_param_1,
+ .param .u32 reduce_min_f_param_2
+)
+{
+ .local .align 16 .b8 __local_depot15[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .f32 %f<61>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot15;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_min_f_param_0];
+ ld.param.u64 %rd2, [reduce_min_f_param_1];
+ ld.param.u32 %r6, [reduce_min_f_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f32 %f45, 0f7F7FFFFF;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB15_4;
+
+BB15_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f30, [%rd5];
+ min.f32 %f45, %f45, %f30;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB15_3;
+
+ mul.wide.u32 %rd7, %r3, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f32 %f31, [%rd8];
+ min.f32 %f45, %f45, %f31;
+
+BB15_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB15_1;
+
+BB15_4:
+ shl.b32 %r16, %r10, 2;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f32 [%r5], %f45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB15_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB15_7;
+
+ ld.shared.f32 %f32, [%r5+2048];
+ min.f32 %f45, %f45, %f32;
+ st.shared.f32 [%r5], %f45;
+
+BB15_7:
+ bar.sync 0;
+
+BB15_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB15_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB15_11;
+
+ ld.shared.f32 %f33, [%r5+1024];
+ min.f32 %f45, %f45, %f33;
+ st.shared.f32 [%r5], %f45;
+
+BB15_11:
+ bar.sync 0;
+
+BB15_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB15_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB15_15;
+
+ ld.shared.f32 %f34, [%r5+512];
+ min.f32 %f45, %f45, %f34;
+ st.shared.f32 [%r5], %f45;
+
+BB15_15:
+ bar.sync 0;
+
+BB15_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB15_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB15_19;
+
+ ld.shared.f32 %f35, [%r5+256];
+ min.f32 %f45, %f45, %f35;
+ st.shared.f32 [%r5], %f45;
+
+BB15_19:
+ bar.sync 0;
+
+BB15_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB15_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB15_23;
+
+ ld.volatile.shared.f32 %f36, [%r5+128];
+ min.f32 %f45, %f45, %f36;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB15_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB15_25;
+
+ ld.volatile.shared.f32 %f37, [%r5+64];
+ min.f32 %f45, %f45, %f37;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB15_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB15_27;
+
+ ld.volatile.shared.f32 %f38, [%r5+32];
+ min.f32 %f45, %f45, %f38;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB15_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB15_29;
+
+ ld.volatile.shared.f32 %f39, [%r5+16];
+ min.f32 %f45, %f45, %f39;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB15_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB15_31;
+
+ ld.volatile.shared.f32 %f40, [%r5+8];
+ min.f32 %f45, %f45, %f40;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB15_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB15_33;
+
+ ld.volatile.shared.f32 %f41, [%r5+4];
+ min.f32 %f42, %f45, %f41;
+ st.volatile.shared.f32 [%r5], %f42;
+
+BB15_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB15_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB15_36;
+
+ ld.shared.f32 %f43, [memory];
+ cvt.f64.f32 %fd1, %f43;
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd1;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 5
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 5
+
+BB15_36:
+ ld.shared.f32 %f44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f32 [%rd15], %f44;
+
+BB15_37:
+ ret;
+}
+
+ // .globl reduce_row_min_d
+.visible .entry reduce_row_min_d(
+ .param .u64 reduce_row_min_d_param_0,
+ .param .u64 reduce_row_min_d_param_1,
+ .param .u32 reduce_row_min_d_param_2,
+ .param .u32 reduce_row_min_d_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .b32 %r<72>;
+ .reg .f64 %fd<56>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_min_d_param_0];
+ ld.param.u64 %rd2, [reduce_row_min_d_param_1];
+ ld.param.u32 %r5, [reduce_row_min_d_param_2];
+ ld.param.u32 %r4, [reduce_row_min_d_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB16_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB16_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB16_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd28, [%rd5];
+ min.f64 %fd6, %fd6, %fd28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB16_3;
+
+BB16_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 3;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f64 [%r13], %fd6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB16_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB16_7;
+
+ ld.shared.f64 %fd29, [%r13+4096];
+ min.f64 %fd6, %fd6, %fd29;
+ st.shared.f64 [%r13], %fd6;
+
+BB16_7:
+ bar.sync 0;
+
+BB16_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB16_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB16_11;
+
+ ld.shared.f64 %fd30, [%r13+2048];
+ min.f64 %fd6, %fd6, %fd30;
+ st.shared.f64 [%r13], %fd6;
+
+BB16_11:
+ bar.sync 0;
+
+BB16_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB16_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB16_15;
+
+ ld.shared.f64 %fd31, [%r13+1024];
+ min.f64 %fd6, %fd6, %fd31;
+ st.shared.f64 [%r13], %fd6;
+
+BB16_15:
+ bar.sync 0;
+
+BB16_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB16_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB16_19;
+
+ ld.shared.f64 %fd32, [%r13+512];
+ min.f64 %fd6, %fd6, %fd32;
+ st.shared.f64 [%r13], %fd6;
+
+BB16_19:
+ bar.sync 0;
+
+BB16_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB16_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB16_23;
+
+ ld.volatile.shared.f64 %fd33, [%r13+256];
+ min.f64 %fd6, %fd6, %fd33;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB16_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB16_25;
+
+ ld.volatile.shared.f64 %fd34, [%r13+128];
+ min.f64 %fd6, %fd6, %fd34;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB16_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB16_27;
+
+ ld.volatile.shared.f64 %fd35, [%r13+64];
+ min.f64 %fd6, %fd6, %fd35;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB16_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB16_29;
+
+ ld.volatile.shared.f64 %fd36, [%r13+32];
+ min.f64 %fd6, %fd6, %fd36;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB16_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB16_31;
+
+ ld.volatile.shared.f64 %fd37, [%r13+16];
+ min.f64 %fd6, %fd6, %fd37;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB16_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB16_33;
+
+ ld.volatile.shared.f64 %fd38, [%r13+8];
+ min.f64 %fd39, %fd6, %fd38;
+ st.volatile.shared.f64 [%r13], %fd39;
+
+BB16_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB16_35;
+
+ ld.shared.f64 %fd40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd40;
+
+BB16_35:
+ ret;
+}
+
+ // .globl reduce_row_min_f
+.visible .entry reduce_row_min_f(
+ .param .u64 reduce_row_min_f_param_0,
+ .param .u64 reduce_row_min_f_param_1,
+ .param .u32 reduce_row_min_f_param_2,
+ .param .u32 reduce_row_min_f_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .f32 %f<56>;
+ .reg .b32 %r<72>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_min_f_param_0];
+ ld.param.u64 %rd2, [reduce_row_min_f_param_1];
+ ld.param.u32 %r5, [reduce_row_min_f_param_2];
+ ld.param.u32 %r4, [reduce_row_min_f_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB17_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f32 %f6, 0f7F7FFFFF;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB17_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB17_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f28, [%rd5];
+ min.f32 %f6, %f6, %f28;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB17_3;
+
+BB17_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 2;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f32 [%r13], %f6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB17_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB17_7;
+
+ ld.shared.f32 %f29, [%r13+2048];
+ min.f32 %f6, %f6, %f29;
+ st.shared.f32 [%r13], %f6;
+
+BB17_7:
+ bar.sync 0;
+
+BB17_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB17_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB17_11;
+
+ ld.shared.f32 %f30, [%r13+1024];
+ min.f32 %f6, %f6, %f30;
+ st.shared.f32 [%r13], %f6;
+
+BB17_11:
+ bar.sync 0;
+
+BB17_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB17_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB17_15;
+
+ ld.shared.f32 %f31, [%r13+512];
+ min.f32 %f6, %f6, %f31;
+ st.shared.f32 [%r13], %f6;
+
+BB17_15:
+ bar.sync 0;
+
+BB17_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB17_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB17_19;
+
+ ld.shared.f32 %f32, [%r13+256];
+ min.f32 %f6, %f6, %f32;
+ st.shared.f32 [%r13], %f6;
+
+BB17_19:
+ bar.sync 0;
+
+BB17_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB17_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB17_23;
+
+ ld.volatile.shared.f32 %f33, [%r13+128];
+ min.f32 %f6, %f6, %f33;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB17_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB17_25;
+
+ ld.volatile.shared.f32 %f34, [%r13+64];
+ min.f32 %f6, %f6, %f34;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB17_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB17_27;
+
+ ld.volatile.shared.f32 %f35, [%r13+32];
+ min.f32 %f6, %f6, %f35;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB17_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB17_29;
+
+ ld.volatile.shared.f32 %f36, [%r13+16];
+ min.f32 %f6, %f6, %f36;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB17_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB17_31;
+
+ ld.volatile.shared.f32 %f37, [%r13+8];
+ min.f32 %f6, %f6, %f37;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB17_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB17_33;
+
+ ld.volatile.shared.f32 %f38, [%r13+4];
+ min.f32 %f39, %f6, %f38;
+ st.volatile.shared.f32 [%r13], %f39;
+
+BB17_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB17_35;
+
+ ld.shared.f32 %f40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f40;
+
+BB17_35:
+ ret;
+}
+
+ // .globl reduce_col_min_d
+.visible .entry reduce_col_min_d(
+ .param .u64 reduce_col_min_d_param_0,
+ .param .u64 reduce_col_min_d_param_1,
+ .param .u32 reduce_col_min_d_param_2,
+ .param .u32 reduce_col_min_d_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<9>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_min_d_param_0];
+ ld.param.u64 %rd3, [reduce_col_min_d_param_1];
+ ld.param.u32 %r5, [reduce_col_min_d_param_2];
+ ld.param.u32 %r6, [reduce_col_min_d_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB18_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB18_4;
+
+ mov.u32 %r10, %r1;
+
+BB18_3:
+ mul.wide.u32 %rd4, %r10, 8;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f64 %fd6, [%rd5];
+ min.f64 %fd8, %fd8, %fd6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB18_3;
+
+BB18_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8;
+
+BB18_5:
+ ret;
+}
+
+ // .globl reduce_col_min_f
+.visible .entry reduce_col_min_f(
+ .param .u64 reduce_col_min_f_param_0,
+ .param .u64 reduce_col_min_f_param_1,
+ .param .u32 reduce_col_min_f_param_2,
+ .param .u32 reduce_col_min_f_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<11>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_min_f_param_0];
+ ld.param.u64 %rd3, [reduce_col_min_f_param_1];
+ ld.param.u32 %r5, [reduce_col_min_f_param_2];
+ ld.param.u32 %r6, [reduce_col_min_f_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB19_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f32 %f8, 0f7F7FFFFF;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB19_4;
+
+ mov.u32 %r10, %r1;
+
+BB19_3:
+ mul.wide.u32 %rd4, %r10, 4;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f32 %f6, [%rd5];
+ min.f32 %f8, %f8, %f6;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB19_3;
+
+BB19_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f8;
+
+BB19_5:
+ ret;
+}
+
+ // .globl reduce_sum_sq_d
+.visible .entry reduce_sum_sq_d(
+ .param .u64 reduce_sum_sq_d_param_0,
+ .param .u64 reduce_sum_sq_d_param_1,
+ .param .u32 reduce_sum_sq_d_param_2
+)
+{
+ .local .align 16 .b8 __local_depot20[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<61>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot20;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_sum_sq_d_param_0];
+ ld.param.u64 %rd2, [reduce_sum_sq_d_param_1];
+ ld.param.u32 %r6, [reduce_sum_sq_d_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f64 %fd45, 0d0000000000000000;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB20_4;
+
+BB20_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd30, [%rd5];
+ fma.rn.f64 %fd45, %fd30, %fd30, %fd45;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB20_3;
+
+ mul.wide.u32 %rd7, %r3, 8;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f64 %fd31, [%rd8];
+ fma.rn.f64 %fd45, %fd31, %fd31, %fd45;
+
+BB20_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB20_1;
+
+BB20_4:
+ shl.b32 %r16, %r10, 3;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f64 [%r5], %fd45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB20_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB20_7;
+
+ ld.shared.f64 %fd32, [%r5+4096];
+ fma.rn.f64 %fd45, %fd32, %fd32, %fd45;
+ st.shared.f64 [%r5], %fd45;
+
+BB20_7:
+ bar.sync 0;
+
+BB20_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB20_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB20_11;
+
+ ld.shared.f64 %fd33, [%r5+2048];
+ fma.rn.f64 %fd45, %fd33, %fd33, %fd45;
+ st.shared.f64 [%r5], %fd45;
+
+BB20_11:
+ bar.sync 0;
+
+BB20_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB20_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB20_15;
+
+ ld.shared.f64 %fd34, [%r5+1024];
+ fma.rn.f64 %fd45, %fd34, %fd34, %fd45;
+ st.shared.f64 [%r5], %fd45;
+
+BB20_15:
+ bar.sync 0;
+
+BB20_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB20_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB20_19;
+
+ ld.shared.f64 %fd35, [%r5+512];
+ fma.rn.f64 %fd45, %fd35, %fd35, %fd45;
+ st.shared.f64 [%r5], %fd45;
+
+BB20_19:
+ bar.sync 0;
+
+BB20_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB20_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB20_23;
+
+ ld.volatile.shared.f64 %fd36, [%r5+256];
+ fma.rn.f64 %fd45, %fd36, %fd36, %fd45;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB20_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB20_25;
+
+ ld.volatile.shared.f64 %fd37, [%r5+128];
+ fma.rn.f64 %fd45, %fd37, %fd37, %fd45;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB20_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB20_27;
+
+ ld.volatile.shared.f64 %fd38, [%r5+64];
+ fma.rn.f64 %fd45, %fd38, %fd38, %fd45;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB20_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB20_29;
+
+ ld.volatile.shared.f64 %fd39, [%r5+32];
+ fma.rn.f64 %fd45, %fd39, %fd39, %fd45;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB20_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB20_31;
+
+ ld.volatile.shared.f64 %fd40, [%r5+16];
+ fma.rn.f64 %fd45, %fd40, %fd40, %fd45;
+ st.volatile.shared.f64 [%r5], %fd45;
+
+BB20_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB20_33;
+
+ ld.volatile.shared.f64 %fd41, [%r5+8];
+ fma.rn.f64 %fd42, %fd41, %fd41, %fd45;
+ st.volatile.shared.f64 [%r5], %fd42;
+
+BB20_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB20_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB20_36;
+
+ ld.shared.f64 %fd43, [memory];
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd43;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 6
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 6
+
+BB20_36:
+ ld.shared.f64 %fd44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 8;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f64 [%rd15], %fd44;
+
+BB20_37:
+ ret;
+}
+
+ // .globl reduce_sum_sq_f
+.visible .entry reduce_sum_sq_f(
+ .param .u64 reduce_sum_sq_f_param_0,
+ .param .u64 reduce_sum_sq_f_param_1,
+ .param .u32 reduce_sum_sq_f_param_2
+)
+{
+ .local .align 16 .b8 __local_depot21[16];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<21>;
+ .reg .f32 %f<61>;
+ .reg .b32 %r<39>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<16>;
+
+
+ mov.u64 %SPL, __local_depot21;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd1, [reduce_sum_sq_f_param_0];
+ ld.param.u64 %rd2, [reduce_sum_sq_f_param_1];
+ ld.param.u32 %r6, [reduce_sum_sq_f_param_2];
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r38, %r8, %r9, %r10;
+ mov.f32 %f45, 0f00000000;
+ setp.ge.u32 %p1, %r38, %r6;
+ @%p1 bra BB21_4;
+
+BB21_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.u32 %rd4, %r38, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f30, [%rd5];
+ fma.rn.f32 %f45, %f30, %f30, %f45;
+ add.s32 %r3, %r38, %r9;
+ setp.ge.u32 %p2, %r3, %r6;
+ @%p2 bra BB21_3;
+
+ mul.wide.u32 %rd7, %r3, 4;
+ add.s64 %rd8, %rd3, %rd7;
+ ld.global.f32 %f31, [%rd8];
+ fma.rn.f32 %f45, %f31, %f31, %f45;
+
+BB21_3:
+ shl.b32 %r13, %r9, 1;
+ mov.u32 %r14, %nctaid.x;
+ mad.lo.s32 %r38, %r13, %r14, %r38;
+ setp.lt.u32 %p3, %r38, %r6;
+ @%p3 bra BB21_1;
+
+BB21_4:
+ shl.b32 %r16, %r10, 2;
+ mov.u32 %r17, memory;
+ add.s32 %r5, %r17, %r16;
+ st.shared.f32 [%r5], %f45;
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 1024;
+ @%p4 bra BB21_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB21_7;
+
+ ld.shared.f32 %f32, [%r5+2048];
+ fma.rn.f32 %f45, %f32, %f32, %f45;
+ st.shared.f32 [%r5], %f45;
+
+BB21_7:
+ bar.sync 0;
+
+BB21_8:
+ setp.lt.u32 %p6, %r9, 512;
+ @%p6 bra BB21_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB21_11;
+
+ ld.shared.f32 %f33, [%r5+1024];
+ fma.rn.f32 %f45, %f33, %f33, %f45;
+ st.shared.f32 [%r5], %f45;
+
+BB21_11:
+ bar.sync 0;
+
+BB21_12:
+ setp.lt.u32 %p8, %r9, 256;
+ @%p8 bra BB21_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB21_15;
+
+ ld.shared.f32 %f34, [%r5+512];
+ fma.rn.f32 %f45, %f34, %f34, %f45;
+ st.shared.f32 [%r5], %f45;
+
+BB21_15:
+ bar.sync 0;
+
+BB21_16:
+ setp.lt.u32 %p10, %r9, 128;
+ @%p10 bra BB21_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB21_19;
+
+ ld.shared.f32 %f35, [%r5+256];
+ fma.rn.f32 %f45, %f35, %f35, %f45;
+ st.shared.f32 [%r5], %f45;
+
+BB21_19:
+ bar.sync 0;
+
+BB21_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB21_33;
+
+ setp.lt.u32 %p13, %r9, 64;
+ @%p13 bra BB21_23;
+
+ ld.volatile.shared.f32 %f36, [%r5+128];
+ fma.rn.f32 %f45, %f36, %f36, %f45;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB21_23:
+ setp.lt.u32 %p14, %r9, 32;
+ @%p14 bra BB21_25;
+
+ ld.volatile.shared.f32 %f37, [%r5+64];
+ fma.rn.f32 %f45, %f37, %f37, %f45;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB21_25:
+ setp.lt.u32 %p15, %r9, 16;
+ @%p15 bra BB21_27;
+
+ ld.volatile.shared.f32 %f38, [%r5+32];
+ fma.rn.f32 %f45, %f38, %f38, %f45;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB21_27:
+ setp.lt.u32 %p16, %r9, 8;
+ @%p16 bra BB21_29;
+
+ ld.volatile.shared.f32 %f39, [%r5+16];
+ fma.rn.f32 %f45, %f39, %f39, %f45;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB21_29:
+ setp.lt.u32 %p17, %r9, 4;
+ @%p17 bra BB21_31;
+
+ ld.volatile.shared.f32 %f40, [%r5+8];
+ fma.rn.f32 %f45, %f40, %f40, %f45;
+ st.volatile.shared.f32 [%r5], %f45;
+
+BB21_31:
+ setp.lt.u32 %p18, %r9, 2;
+ @%p18 bra BB21_33;
+
+ ld.volatile.shared.f32 %f41, [%r5+4];
+ fma.rn.f32 %f42, %f41, %f41, %f45;
+ st.volatile.shared.f32 [%r5], %f42;
+
+BB21_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB21_37;
+
+ mov.u32 %r34, %nctaid.x;
+ setp.gt.u32 %p20, %r34, 9;
+ @%p20 bra BB21_36;
+
+ ld.shared.f32 %f43, [memory];
+ cvt.f64.f32 %fd1, %f43;
+ add.u64 %rd9, %SP, 0;
+ add.u64 %rd10, %SPL, 0;
+ st.local.u32 [%rd10], %r7;
+ st.local.f64 [%rd10+8], %fd1;
+ mov.u64 %rd11, $str;
+ cvta.global.u64 %rd12, %rd11;
+ // Callseq Start 7
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd12;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd9;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r36, [retval0+0];
+
+ //{
+ }// Callseq End 7
+
+BB21_36:
+ ld.shared.f32 %f44, [memory];
+ cvta.to.global.u64 %rd13, %rd2;
+ mul.wide.u32 %rd14, %r7, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ st.global.f32 [%rd15], %f44;
+
+BB21_37:
+ ret;
+}
+
+ // .globl reduce_col_sum_sq_d
+.visible .entry reduce_col_sum_sq_d(
+ .param .u64 reduce_col_sum_sq_d_param_0,
+ .param .u64 reduce_col_sum_sq_d_param_1,
+ .param .u32 reduce_col_sum_sq_d_param_2,
+ .param .u32 reduce_col_sum_sq_d_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<9>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_sum_sq_d_param_0];
+ ld.param.u64 %rd3, [reduce_col_sum_sq_d_param_1];
+ ld.param.u32 %r5, [reduce_col_sum_sq_d_param_2];
+ ld.param.u32 %r6, [reduce_col_sum_sq_d_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB22_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f64 %fd8, 0d0000000000000000;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB22_4;
+
+ mov.u32 %r10, %r1;
+
+BB22_3:
+ mul.wide.u32 %rd4, %r10, 8;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f64 %fd6, [%rd5];
+ fma.rn.f64 %fd8, %fd6, %fd6, %fd8;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB22_3;
+
+BB22_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8;
+
+BB22_5:
+ ret;
+}
+
+ // .globl reduce_col_sum_sq_f
+.visible .entry reduce_col_sum_sq_f(
+ .param .u64 reduce_col_sum_sq_f_param_0,
+ .param .u64 reduce_col_sum_sq_f_param_1,
+ .param .u32 reduce_col_sum_sq_f_param_2,
+ .param .u32 reduce_col_sum_sq_f_param_3
+)
+{
+ .reg .pred %p<4>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<11>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd2, [reduce_col_sum_sq_f_param_0];
+ ld.param.u64 %rd3, [reduce_col_sum_sq_f_param_1];
+ ld.param.u32 %r5, [reduce_col_sum_sq_f_param_2];
+ ld.param.u32 %r6, [reduce_col_sum_sq_f_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB23_5;
+
+ mul.lo.s32 %r2, %r6, %r5;
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.f32 %f8, 0f00000000;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB23_4;
+
+ mov.u32 %r10, %r1;
+
+BB23_3:
+ mul.wide.u32 %rd4, %r10, 4;
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f32 %f6, [%rd5];
+ fma.rn.f32 %f8, %f6, %f6, %f8;
+ add.s32 %r10, %r10, %r6;
+ setp.lt.u32 %p3, %r10, %r2;
+ @%p3 bra BB23_3;
+
+BB23_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f8;
+
+BB23_5:
+ ret;
+}
+
+ // .globl reduce_row_sum_sq_d
+.visible .entry reduce_row_sum_sq_d(
+ .param .u64 reduce_row_sum_sq_d_param_0,
+ .param .u64 reduce_row_sum_sq_d_param_1,
+ .param .u32 reduce_row_sum_sq_d_param_2,
+ .param .u32 reduce_row_sum_sq_d_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .b32 %r<72>;
+ .reg .f64 %fd<56>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_sum_sq_d_param_0];
+ ld.param.u64 %rd2, [reduce_row_sum_sq_d_param_1];
+ ld.param.u32 %r5, [reduce_row_sum_sq_d_param_2];
+ ld.param.u32 %r4, [reduce_row_sum_sq_d_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB24_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f64 %fd6, 0d0000000000000000;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB24_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB24_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd28, [%rd5];
+ fma.rn.f64 %fd6, %fd28, %fd28, %fd6;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB24_3;
+
+BB24_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 3;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f64 [%r13], %fd6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB24_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB24_7;
+
+ ld.shared.f64 %fd29, [%r13+4096];
+ fma.rn.f64 %fd6, %fd29, %fd29, %fd6;
+ st.shared.f64 [%r13], %fd6;
+
+BB24_7:
+ bar.sync 0;
+
+BB24_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB24_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB24_11;
+
+ ld.shared.f64 %fd30, [%r13+2048];
+ fma.rn.f64 %fd6, %fd30, %fd30, %fd6;
+ st.shared.f64 [%r13], %fd6;
+
+BB24_11:
+ bar.sync 0;
+
+BB24_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB24_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB24_15;
+
+ ld.shared.f64 %fd31, [%r13+1024];
+ fma.rn.f64 %fd6, %fd31, %fd31, %fd6;
+ st.shared.f64 [%r13], %fd6;
+
+BB24_15:
+ bar.sync 0;
+
+BB24_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB24_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB24_19;
+
+ ld.shared.f64 %fd32, [%r13+512];
+ fma.rn.f64 %fd6, %fd32, %fd32, %fd6;
+ st.shared.f64 [%r13], %fd6;
+
+BB24_19:
+ bar.sync 0;
+
+BB24_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB24_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB24_23;
+
+ ld.volatile.shared.f64 %fd33, [%r13+256];
+ fma.rn.f64 %fd6, %fd33, %fd33, %fd6;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB24_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB24_25;
+
+ ld.volatile.shared.f64 %fd34, [%r13+128];
+ fma.rn.f64 %fd6, %fd34, %fd34, %fd6;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB24_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB24_27;
+
+ ld.volatile.shared.f64 %fd35, [%r13+64];
+ fma.rn.f64 %fd6, %fd35, %fd35, %fd6;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB24_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB24_29;
+
+ ld.volatile.shared.f64 %fd36, [%r13+32];
+ fma.rn.f64 %fd6, %fd36, %fd36, %fd6;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB24_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB24_31;
+
+ ld.volatile.shared.f64 %fd37, [%r13+16];
+ fma.rn.f64 %fd6, %fd37, %fd37, %fd6;
+ st.volatile.shared.f64 [%r13], %fd6;
+
+BB24_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB24_33;
+
+ ld.volatile.shared.f64 %fd38, [%r13+8];
+ fma.rn.f64 %fd39, %fd38, %fd38, %fd6;
+ st.volatile.shared.f64 [%r13], %fd39;
+
+BB24_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB24_35;
+
+ ld.shared.f64 %fd40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd40;
+
+BB24_35:
+ ret;
+}
+
+ // .globl reduce_row_sum_sq_f
+.visible .entry reduce_row_sum_sq_f(
+ .param .u64 reduce_row_sum_sq_f_param_0,
+ .param .u64 reduce_row_sum_sq_f_param_1,
+ .param .u32 reduce_row_sum_sq_f_param_2,
+ .param .u32 reduce_row_sum_sq_f_param_3
+)
+{
+ .reg .pred %p<20>;
+ .reg .f32 %f<56>;
+ .reg .b32 %r<72>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [reduce_row_sum_sq_f_param_0];
+ ld.param.u64 %rd2, [reduce_row_sum_sq_f_param_1];
+ ld.param.u32 %r5, [reduce_row_sum_sq_f_param_2];
+ ld.param.u32 %r4, [reduce_row_sum_sq_f_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB25_35;
+
+ mov.u32 %r71, %tid.x;
+ mov.f32 %f6, 0f00000000;
+ setp.ge.u32 %p2, %r71, %r4;
+ @%p2 bra BB25_4;
+
+ cvta.to.global.u64 %rd3, %rd1;
+
+BB25_3:
+ mad.lo.s32 %r8, %r6, %r4, %r71;
+ mul.wide.u32 %rd4, %r8, 4;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f32 %f28, [%rd5];
+ fma.rn.f32 %f6, %f28, %f28, %f6;
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r71, %r9, %r71;
+ setp.lt.u32 %p3, %r71, %r4;
+ @%p3 bra BB25_3;
+
+BB25_4:
+ mov.u32 %r10, %tid.x;
+ shl.b32 %r11, %r10, 2;
+ mov.u32 %r12, memory;
+ add.s32 %r13, %r12, %r11;
+ st.shared.f32 [%r13], %f6;
+ bar.sync 0;
+ mov.u32 %r14, %ntid.x;
+ setp.lt.u32 %p4, %r14, 1024;
+ @%p4 bra BB25_8;
+
+ setp.gt.u32 %p5, %r10, 511;
+ @%p5 bra BB25_7;
+
+ ld.shared.f32 %f29, [%r13+2048];
+ fma.rn.f32 %f6, %f29, %f29, %f6;
+ st.shared.f32 [%r13], %f6;
+
+BB25_7:
+ bar.sync 0;
+
+BB25_8:
+ setp.lt.u32 %p6, %r14, 512;
+ @%p6 bra BB25_12;
+
+ setp.gt.u32 %p7, %r10, 255;
+ @%p7 bra BB25_11;
+
+ ld.shared.f32 %f30, [%r13+1024];
+ fma.rn.f32 %f6, %f30, %f30, %f6;
+ st.shared.f32 [%r13], %f6;
+
+BB25_11:
+ bar.sync 0;
+
+BB25_12:
+ setp.lt.u32 %p8, %r14, 256;
+ @%p8 bra BB25_16;
+
+ setp.gt.u32 %p9, %r10, 127;
+ @%p9 bra BB25_15;
+
+ ld.shared.f32 %f31, [%r13+512];
+ fma.rn.f32 %f6, %f31, %f31, %f6;
+ st.shared.f32 [%r13], %f6;
+
+BB25_15:
+ bar.sync 0;
+
+BB25_16:
+ setp.lt.u32 %p10, %r14, 128;
+ @%p10 bra BB25_20;
+
+ setp.gt.u32 %p11, %r10, 63;
+ @%p11 bra BB25_19;
+
+ ld.shared.f32 %f32, [%r13+256];
+ fma.rn.f32 %f6, %f32, %f32, %f6;
+ st.shared.f32 [%r13], %f6;
+
+BB25_19:
+ bar.sync 0;
+
+BB25_20:
+ setp.gt.u32 %p12, %r10, 31;
+ @%p12 bra BB25_33;
+
+ setp.lt.u32 %p13, %r14, 64;
+ @%p13 bra BB25_23;
+
+ ld.volatile.shared.f32 %f33, [%r13+128];
+ fma.rn.f32 %f6, %f33, %f33, %f6;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB25_23:
+ setp.lt.u32 %p14, %r14, 32;
+ @%p14 bra BB25_25;
+
+ ld.volatile.shared.f32 %f34, [%r13+64];
+ fma.rn.f32 %f6, %f34, %f34, %f6;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB25_25:
+ setp.lt.u32 %p15, %r14, 16;
+ @%p15 bra BB25_27;
+
+ ld.volatile.shared.f32 %f35, [%r13+32];
+ fma.rn.f32 %f6, %f35, %f35, %f6;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB25_27:
+ setp.lt.u32 %p16, %r14, 8;
+ @%p16 bra BB25_29;
+
+ ld.volatile.shared.f32 %f36, [%r13+16];
+ fma.rn.f32 %f6, %f36, %f36, %f6;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB25_29:
+ setp.lt.u32 %p17, %r14, 4;
+ @%p17 bra BB25_31;
+
+ ld.volatile.shared.f32 %f37, [%r13+8];
+ fma.rn.f32 %f6, %f37, %f37, %f6;
+ st.volatile.shared.f32 [%r13], %f6;
+
+BB25_31:
+ setp.lt.u32 %p18, %r14, 2;
+ @%p18 bra BB25_33;
+
+ ld.volatile.shared.f32 %f38, [%r13+4];
+ fma.rn.f32 %f39, %f38, %f38, %f6;
+ st.volatile.shared.f32 [%r13], %f39;
+
+BB25_33:
+ setp.ne.s32 %p19, %r10, 0;
+ @%p19 bra BB25_35;
+
+ ld.shared.f32 %f40, [memory];
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.u32 %rd7, %r6, 4;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f32 [%rd8], %f40;
+
+BB25_35:
+ ret;
+}
+
+
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
new file mode 100644
index 0000000..36299c7
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "SpoofCUDAContext.h"
+
+#include <filesystem>
+#include <iostream>
+#include <cstdlib>
+#include <sstream>
+
+// Creates and initializes the per-device SPOOF CUDA context: loads the
+// pre-compiled reduction PTX module and registers all built-in reduction
+// kernels (double "_d" and float "_f" variants) by name.
+// Returns the heap-allocated context pointer encoded as a size_t so it can
+// be passed through JNI as an opaque handle; ownership transfers to the
+// caller, who must release it via destroy_cuda().
+size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resource_path) {
+
+#ifdef __DEBUG
+	std::cout << "initializing cuda device " << device_id << std::endl;
+#endif
+
+	SpoofCUDAContext *ctx = new SpoofCUDAContext(resource_path);
+	// cuda device is handled by jCuda atm
+	//cudaSetDevice(device_id);
+	//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+	//cudaDeviceSynchronize();
+
+	CHECK_CUDA(cuModuleLoad(&(ctx->reductions), std::string(ctx->resource_path + std::string("/cuda/kernels/reduction.ptx")).c_str()));
+
+	// Data-driven registration of the reduction kernels; adding a new kernel
+	// only requires extending this list (replaces the former copy-pasted
+	// lookup blocks for SUM/SUM_SQ/MIN/MAX).
+	const char* kernel_names[] = {
+		"reduce_sum_d",    "reduce_sum_f",
+		"reduce_sum_sq_d", "reduce_sum_sq_f",
+		"reduce_min_d",    "reduce_min_f",
+		"reduce_max_d",    "reduce_max_f"
+	};
+
+	for (const char* kernel_name : kernel_names) {
+		CUfunction func;
+		CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, kernel_name));
+		ctx->reduction_kernels.insert(std::make_pair(kernel_name, func));
+	}
+
+	return reinterpret_cast<size_t>(ctx);
+}
+
+// Tears down a context created by initialize_cuda(). The CUDA device itself
+// is managed by jCuda at the moment, so no cudaDeviceReset() here.
+// Note: the previous `ctx = nullptr;` after delete was dead code — it only
+// nulled the by-value parameter copy, never the caller's handle — and has
+// been removed.
+void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id) {
+	delete ctx;
+	// cuda device is handled by jCuda atm
+	//cudaDeviceReset();
+}
+
+// Compiles a generated SPOOF CUDA source string with jitify and registers the
+// resulting program under the given operator name.
+// The aggregation classification (CellType tag -> AggType, AggOp tag -> AggOp)
+// is parsed out of a 30-character window following each tag in the source.
+// Returns false if the source carries an unknown aggregation type/operator.
+//
+// Fixes over the previous version:
+//  - `pos` is std::string::size_type: find() returns npos (max size_t) on
+//    failure, and comparing an int against npos only worked by accidental
+//    unsigned conversion.
+//  - substr(pos, 30): the second argument of substr is a LENGTH, not an end
+//    position, so substr(pos, pos+30) scanned an ever-growing window.
+//  - "AggOp.SUM_SQ" is tested before "AggOp.SUM"; the latter is a prefix of
+//    the former, so SUM_SQ operators were misclassified as SUM.
+bool SpoofCUDAContext::compile_cuda(const std::string &src,
+    const std::string &name) {
+  std::string cuda_include_path("");
+  char* cdp = std::getenv("CUDA_PATH");
+  if(cdp != nullptr)
+    cuda_include_path = std::string("-I") + std::string(cdp) + "/include";
+  else {
+    std::cout << "Warning: CUDA_PATH environment variable not set. Using default include path"
+        "/usr/local/cuda/include" << std::endl;
+    cuda_include_path = std::string("-I/usr/local/cuda/include");
+  }
+
+#ifdef __DEBUG
+  std::cout << "compiling cuda kernel " << name << std::endl;
+  std::cout << src << std::endl;
+  std::cout << "cwd: " << std::filesystem::current_path() << std::endl;
+  std::cout << "cuda_path: " << cuda_include_path << std::endl;
+#endif
+
+  SpoofOperator::AggType type = SpoofOperator::AggType::NONE;
+  SpoofOperator::AggOp op = SpoofOperator::AggOp::NONE;
+
+  std::string::size_type pos = 0;
+  if((pos = src.find("CellType")) != std::string::npos) {
+    // classify the aggregation type from the text right after the tag
+    const std::string type_window = src.substr(pos, 30);
+    if(type_window.find("FULL_AGG") != std::string::npos)
+      type = SpoofOperator::AggType::FULL_AGG;
+    else if(type_window.find("ROW_AGG") != std::string::npos)
+      type = SpoofOperator::AggType::ROW_AGG;
+    else if(type_window.find("COL_AGG") != std::string::npos)
+      type = SpoofOperator::AggType::COL_AGG;
+    else if(type_window.find("NO_AGG") != std::string::npos)
+      type = SpoofOperator::AggType::NO_AGG;
+    else {
+      std::cerr << "error: unknown aggregation type" << std::endl;
+      return false;
+    }
+
+    if(type != SpoofOperator::AggType::NO_AGG) {
+      if((pos = src.find("AggOp")) != std::string::npos) {
+        const std::string op_window = src.substr(pos, 30);
+        // SUM_SQ must be tested before SUM ("AggOp.SUM" is its prefix)
+        if(op_window.find("AggOp.SUM_SQ") != std::string::npos)
+          op = SpoofOperator::AggOp::SUM_SQ;
+        else if(op_window.find("AggOp.SUM") != std::string::npos)
+          op = SpoofOperator::AggOp::SUM;
+        else if(op_window.find("AggOp.MIN") != std::string::npos)
+          op = SpoofOperator::AggOp::MIN;
+        else if(op_window.find("AggOp.MAX") != std::string::npos)
+          op = SpoofOperator::AggOp::MAX;
+        else {
+          std::cerr << "error: unknown aggregation operator" << std::endl;
+          return false;
+        }
+      }
+    }
+  }
+
+  std::stringstream s1, s2, s3;
+  s1 << "-I" << resource_path << "/cuda/headers";
+  s2 << "-I" << resource_path << "/cuda/spoof";
+
+  jitify::Program program = kernel_cache.program(src, 0, {s1.str(), s2.str(), cuda_include_path});
+  ops.insert(std::make_pair(name, SpoofOperator({std::move(program), type, op})));
+  return true;
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
new file mode 100644
index 0000000..36d29ec
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef SPOOFCUDACONTEXT_H
+#define SPOOFCUDACONTEXT_H
+
+#include <cmath>
+#include <cstdint>
+#include <map>
+#include <string>
+
+#ifdef __DEBUG
+ #define JITIFY_PRINT_ALL 1
+#endif
+
+#include <jitify.hpp>
+
+#include "host_utils.h"
+
+using jitify::reflection::type_of;
+
+// Descriptor of one JIT-compiled SPOOF operator: the compiled jitify program
+// plus the aggregation semantics parsed from the generated CUDA source
+// (see SpoofCUDAContext::compile_cuda).
+struct SpoofOperator {
+ enum class AggType : int { NO_AGG, ROW_AGG, COL_AGG, FULL_AGG, NONE };
+ enum class AggOp : int {SUM, SUM_SQ, MIN, MAX, NONE };
+
+ jitify::Program program; // JIT-compiled CUDA program for this operator
+ AggType agg_type; // aggregation shape: none / row / column / full
+ AggOp agg_op; // aggregation operator; NONE when agg_type == NO_AGG
+
+};
+
+// Host-side SPOOF CUDA context: owns the jitify JIT cache, the map of
+// compiled generated operators, and the pre-compiled reduction kernels,
+// and launches the generated cell-wise kernels with a grid/block
+// configuration chosen per aggregation type.
+class SpoofCUDAContext {
+
+ jitify::JitCache kernel_cache;
+ std::map<const std::string, SpoofOperator> ops; // compiled operators by name
+ CUmodule reductions; // module holding the pre-built reduction kernels
+ std::map<const std::string, CUfunction> reduction_kernels;
+
+public:
+ // ToDo: make launch config more adaptive
+ // num threads
+ const int NT = 256;
+
+ // values / thread
+ const int VT = 4;
+
+ const std::string resource_path;
+
+ SpoofCUDAContext(const char* resource_path_) : reductions(nullptr), resource_path(resource_path_) {}
+
+ static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
+
+ static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
+
+ bool compile_cuda(const std::string &src, const std::string &name);
+
+ // Launches the previously compiled operator `name`:
+ // in_ptrs/side_ptrs - device pointers of main and side inputs,
+ // out_ptr - device output buffer (unused for FULL_AGG),
+ // scalars_ptr - host-side scalars (copied to the device here),
+ // m, n - matrix dimensions, grix - global row index offset.
+ // Returns the aggregated scalar for FULL_AGG operators, otherwise 0;
+ // also returns 0 if the operator name is unknown (error is printed).
+ template <typename T>
+ T execute_kernel(const std::string &name, T **in_ptrs, int num_inputs,
+ T **side_ptrs, int num_sides, T *out_ptr, T *scalars_ptr,
+ int num_scalars, int m, int n, int grix) {
+
+ T result = 0.0;
+ size_t dev_buf_size;
+ T **d_sides = nullptr;
+ T *d_scalars = nullptr;
+ T *d_temp_agg_buf; // scratch buffer, allocated/freed only in FULL_AGG
+ uint32_t N = m * n; // total cell count; NOTE(review): int*int may overflow for very large matrices -- confirm upstream limits
+
+ auto o = ops.find(name);
+ if (o != ops.end()) {
+ SpoofOperator *op = &(o->second);
+
+ // copy the array of side-input device pointers to the device
+ if (num_sides > 0) {
+ dev_buf_size = sizeof(T *) * num_sides;
+ CHECK_CUDART(cudaMalloc((void **)&d_sides, dev_buf_size));
+ CHECK_CUDART(cudaMemcpy(d_sides, side_ptrs, dev_buf_size, cudaMemcpyHostToDevice));
+ }
+
+ // copy scalar inputs to the device
+ if (num_scalars > 0) {
+ dev_buf_size = sizeof(T) * num_scalars;
+ CHECK_CUDART(cudaMalloc((void **)&d_scalars, dev_buf_size));
+ CHECK_CUDART(cudaMemcpy(d_scalars, scalars_ptr, dev_buf_size, cudaMemcpyHostToDevice));
+ }
+
+ switch (op->agg_type) {
+ case SpoofOperator::AggType::FULL_AGG: {
+ // num ctas
+ // each thread block reduces 2*NT elements into one partial result
+ int NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
+ dim3 grid(NB, 1, 1);
+ dim3 block(NT, 1, 1);
+ unsigned int shared_mem_size = NT * sizeof(T);
+
+ dev_buf_size = sizeof(T) * NB;
+ CHECK_CUDART(cudaMalloc((void **)&d_temp_agg_buf, dev_buf_size));
+#ifdef __DEBUG
+ // ToDo: connect output to SystemDS logging facilities
+ std::cout << "launching spoof cellwise kernel " << name << " with "
+ << NT * NB << " threads in " << NB << " blocks and "
+ << shared_mem_size
+ << " bytes of shared memory for full aggregation of "
+ << N << " elements"
+ << std::endl;
+#endif
+ CHECK_CUDA(op->program.kernel(name)
+ .instantiate(type_of(result))
+ .configure(grid, block, shared_mem_size)
+ .launch(in_ptrs[0], d_sides, d_temp_agg_buf, d_scalars, m, n, grix));
+
+ // iteratively reduce the NB partial results down to a single value
+ // using the matching pre-compiled reduction kernel
+ if(NB > 1) {
+ std::string reduction_kernel_name = determine_agg_kernel<T>(op);
+
+ CUfunction reduce_kernel = reduction_kernels.find(reduction_kernel_name)->second;
+ N = NB;
+ int iter = 1;
+ while (NB > 1) {
+ void* args[3] = { &d_temp_agg_buf, &d_temp_agg_buf, &N};
+
+ NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
+#ifdef __DEBUG
+ std::cout << "agg iter " << iter++ << " launching spoof cellwise kernel " << name << " with "
+ << NT * NB << " threads in " << NB << " blocks and "
+ << shared_mem_size
+ << " bytes of shared memory for full aggregation of "
+ << N << " elements"
+ << std::endl;
+#endif
+ CHECK_CUDA(cuLaunchKernel(reduce_kernel,
+ NB, 1, 1,
+ NT, 1, 1,
+ shared_mem_size, 0, args, 0));
+ N = NB;
+ }
+ }
+
+ // final scalar lives in d_temp_agg_buf[0]
+ CHECK_CUDART(cudaMemcpy(&result, d_temp_agg_buf, sizeof(T), cudaMemcpyDeviceToHost));
+ CHECK_CUDART(cudaFree(d_temp_agg_buf));
+ break;
+ }
+ case SpoofOperator::AggType::COL_AGG: {
+ // num ctas
+ int NB = std::ceil((N + NT - 1) / NT);
+ dim3 grid(NB, 1, 1);
+ dim3 block(NT, 1, 1);
+ unsigned int shared_mem_size = 0; // NOTE(review): unused -- kernel is configured without shared memory
+#ifdef __DEBUG
+ std::cout << " launching spoof cellwise kernel " << name << " with "
+ << NT * NB << " threads in " << NB << " blocks for column aggregation of "
+ << N << " elements" << std::endl;
+#endif
+ CHECK_CUDA(op->program.kernel(name)
+ .instantiate(type_of(result))
+ .configure(grid, block)
+ .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+
+ break;
+ }
+ case SpoofOperator::AggType::ROW_AGG: {
+ // num ctas
+ // one thread block per row
+ int NB = m;
+ dim3 grid(NB, 1, 1);
+ dim3 block(NT, 1, 1);
+ unsigned int shared_mem_size = NT * sizeof(T);
+
+#ifdef __DEBUG
+ std::cout << " launching spoof cellwise kernel " << name << " with "
+ << NT * NB << " threads in " << NB << " blocks and "
+ << shared_mem_size << " bytes of shared memory for row aggregation of "
+ << N << " elements" << std::endl;
+#endif
+ CHECK_CUDA(op->program.kernel(name)
+ .instantiate(type_of(result))
+ .configure(grid, block, shared_mem_size)
+ .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+
+ break;
+ }
+ case SpoofOperator::AggType::NO_AGG:
+ default: {
+ // num ctas
+ // ToDo: VT not a template parameter anymore
+ int NB = std::ceil((N + NT * VT - 1) / (NT * VT));
+ dim3 grid(NB, 1, 1);
+ dim3 block(NT, 1, 1);
+#ifdef __DEBUG
+ std::cout << "launching spoof cellwise kernel " << name << " with " << NT * NB
+ << " threads in " << NB << " blocks without aggregation for "
+ << N << " elements"
+ << std::endl;
+#endif
+ CHECK_CUDA(op->program.kernel(name)
+ .instantiate(type_of(result))
+ .configure(grid, block)
+ .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+ }
+ }
+
+ if (num_scalars > 0)
+ CHECK_CUDART(cudaFree(d_scalars));
+
+ if (num_sides > 0)
+ CHECK_CUDART(cudaFree(d_sides));
+ }
+ else {
+ std::cerr << "kernel " << name << " not found." << std::endl;
+ return result;
+ }
+ return result;
+ }
+
+ // Maps the operator's aggregation type/op to the name of the matching
+ // pre-compiled reduction kernel; the _d/_f suffix selects the value type.
+ // Returns an empty string (after printing an error) for unknown cases.
+ template<typename T>
+ std::string determine_agg_kernel(SpoofOperator* op) {
+ std::string reduction_kernel_name;
+ std::string reduction_type;
+ std::string suffix = (typeid(T) == typeid(double) ? "_d" : "_f");
+ switch (op->agg_type) {
+ case SpoofOperator::AggType::FULL_AGG:
+ reduction_type = "_";
+ break;
+ case SpoofOperator::AggType::ROW_AGG:
+ reduction_type = "_row_";
+ break;
+ case SpoofOperator::AggType::COL_AGG:
+ reduction_type = "_col_";
+ break;
+ default:
+ std::cerr << "unknown reduction type" << std::endl;
+ return "";
+ }
+
+ switch (op->agg_op) {
+ case SpoofOperator::AggOp::MIN:
+ reduction_kernel_name = "reduce" + reduction_type + "min" + suffix;
+ break;
+ case SpoofOperator::AggOp::MAX:
+ reduction_kernel_name = "reduce" + reduction_type + "max" + suffix;
+ break;
+ case SpoofOperator::AggOp::SUM_SQ:
+ reduction_kernel_name = "reduce" + reduction_type + "sum_sq" + suffix;
+ break;
+ case SpoofOperator::AggOp::SUM:
+ reduction_kernel_name = "reduce" + reduction_type + "sum" + suffix;
+ break;
+ default:
+ std::cerr << "unknown reduction op" << std::endl;
+ return "";
+ }
+
+ return reduction_kernel_name;
+ }
+};
+
+#endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/host_utils.h b/src/main/cuda/spoof-launcher/host_utils.h
new file mode 100644
index 0000000..47990ad
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/host_utils.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#ifndef HOST_UTILS_H
+#define HOST_UTILS_H
+
+#include <iostream>     // std::cout / std::endl used by the macros below
+
+#include <cuda.h>          // CUDA driver API: CUresult, cuGetErrorName
+#include <cuda_runtime.h>  // CUDA runtime API: cudaError_t, cudaGetErrorString
+
+// Checks the result of a CUDA *driver* API call (cu* functions) and reports
+// failures with file/line/function context.
+// Note: the error is only printed, not propagated -- execution continues.
+#define CHECK_CUDA(call) \
+  do { \
+    CUresult status = call; \
+    if (status != CUDA_SUCCESS) { \
+      const char* str; \
+      cuGetErrorName(status, &str); \
+      std::cout << "(CUDA) returned " << str; \
+      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
+                << "())" << std::endl; \
+    } \
+  } while (0)
+
+// Same as CHECK_CUDA, but for CUDA *runtime* API calls (cuda* functions).
+#define CHECK_CUDART(call) \
+  do { \
+    cudaError_t status = call; \
+    if (status != cudaSuccess) { \
+      std::cout << "(CUDART) returned " << cudaGetErrorString(status); \
+      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
+                << "())" << std::endl; \
+    } \
+  } while (0)
+
+#endif // HOST_UTILS_H
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
new file mode 100644
index 0000000..6645003
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "jni_bridge.h"
+#include "SpoofCUDAContext.h"
+
+// JNI Methods to get/release arrays
+// NOTE(review): GetPrimitiveArrayCritical may return NULL on failure and
+// disables GC until the matching release call -- the critical section
+// should be kept as short as possible.
+#define GET_ARRAY(env, input) \
+ ((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+
+#define RELEASE_ARRAY(env, java, cpp) \
+ (env->ReleasePrimitiveArrayCritical(java, cpp, 0))
+
+// Creates the native SPOOF CUDA context for the given device and returns
+// its address as an opaque jlong handle (0 on failure).
+JNIEXPORT jlong JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
+ JNIEnv *env, jobject jobj, jint device_id, jstring resource_path) {
+
+ const char *resource_dir = env->GetStringUTFChars(resource_path, NULL);
+ jlong ctx_handle = SpoofCUDAContext::initialize_cuda(device_id, resource_dir);
+ env->ReleaseStringUTFChars(resource_path, resource_dir);
+ return ctx_handle;
+}
+
+// Tears down the native context previously created by initialize_cuda_context;
+// ctx is the jlong handle holding the context pointer.
+JNIEXPORT void JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
+ JNIEnv *env, jobject jobj, jlong ctx, jint device_id) {
+ auto *context = reinterpret_cast<SpoofCUDAContext *>(ctx);
+ SpoofCUDAContext::destroy_cuda(context, device_id);
+}
+
+// JIT-compiles the generated CUDA source `src` under operator name `name`
+// in the given native context; returns true on success.
+JNIEXPORT jboolean JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
+ JNIEnv *env, jobject jobj, jlong ctx, jstring name, jstring src) {
+ auto *context = reinterpret_cast<SpoofCUDAContext *>(ctx);
+ const char *op_name = env->GetStringUTFChars(name, NULL);
+ const char *op_src = env->GetStringUTFChars(src, NULL);
+ jboolean compiled = context->compile_cuda(op_src, op_name);
+ env->ReleaseStringUTFChars(src, op_src);
+ env->ReleaseStringUTFChars(name, op_name);
+ return compiled;
+}
+
+// Executes the named operator in double precision. in_ptrs/side_ptrs carry
+// device addresses as jlongs; scalars are host values copied to the device
+// by execute_kernel. Returns the aggregate for full-aggregation operators.
+JNIEXPORT jdouble JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
+ JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs,
+ jlongArray side_ptrs, jlong out_ptr, jdoubleArray scalars_, jlong m, jlong n, jlong grix) {
+
+ SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+ const char *cstr_name = env->GetStringUTFChars(name, NULL);
+
+ // Pin the Java arrays for the duration of the launch.
+ // NOTE(review): execute_kernel performs cudaMalloc/cudaMemcpy while these
+ // critical arrays are held; JNI discourages arbitrary calls inside a
+ // critical region -- consider Get<Type>ArrayElements instead. TODO confirm.
+ double **inputs = reinterpret_cast<double **>(GET_ARRAY(env, in_ptrs));
+ double **sides = reinterpret_cast<double **>(GET_ARRAY(env, side_ptrs));
+ double *scalars = reinterpret_cast<double *>(GET_ARRAY(env, scalars_));
+
+ double result = ctx_->execute_kernel(
+ cstr_name, inputs, env->GetArrayLength(in_ptrs), sides, env->GetArrayLength(side_ptrs),
+ reinterpret_cast<double*>(out_ptr), scalars, env->GetArrayLength(scalars_), m, n, grix);
+
+ RELEASE_ARRAY(env, in_ptrs, inputs);
+ RELEASE_ARRAY(env, side_ptrs, sides);
+ RELEASE_ARRAY(env, scalars_, scalars);
+
+ // FIXME: that release causes an error
+ //std::cout << "releasing " << name_ << std::endl;
+ env->ReleaseStringUTFChars(name, cstr_name);
+ return result;
+}
+
+// Single-precision variant of execute_d (see above); identical structure,
+// operating on float device buffers and float scalars.
+JNIEXPORT jfloat JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f(
+ JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs,
+ jlongArray side_ptrs, jlong out_ptr, jfloatArray scalars_, jlong m, jlong n, jlong grix) {
+
+ SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+
+ const char *cstr_name = env->GetStringUTFChars(name, NULL);
+
+ // Pin the Java arrays for the duration of the launch (see NOTE on the
+ // double-precision variant regarding JNI critical regions).
+ float **inputs = reinterpret_cast<float**>(GET_ARRAY(env, in_ptrs));
+ float **sides = reinterpret_cast<float **>(GET_ARRAY(env, side_ptrs));
+ float *scalars = reinterpret_cast<float *>(GET_ARRAY(env, scalars_));
+
+ float result = ctx_->execute_kernel(
+ cstr_name, inputs, env->GetArrayLength(in_ptrs), sides, env->GetArrayLength(side_ptrs),
+ reinterpret_cast<float *>(out_ptr), scalars, env->GetArrayLength(scalars_), m, n, grix);
+
+ RELEASE_ARRAY(env, in_ptrs, inputs);
+ RELEASE_ARRAY(env, side_ptrs, sides);
+ RELEASE_ARRAY(env, scalars_, scalars);
+
+ // FIXME: that release causes an error
+ env->ReleaseStringUTFChars(name, cstr_name);
+ return result;
+}
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h b/src/main/cuda/spoof-launcher/jni_bridge.h
new file mode 100644
index 0000000..a06bb1b
--- /dev/null
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+
+#pragma once
+#ifndef JNI_BRIDGE_H
+#define JNI_BRIDGE_H
+
+#include <jni.h>
+/* Header for class org_apache_sysds_hops_codegen_SpoofCompiler */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class: org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method: initialize_cuda_context
+ * Signature: (I)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
+ JNIEnv *, jobject, jint, jstring);
+
+/*
+ * Class: org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method: destroy_cuda_context
+ * Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
+ JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class: org_apache_sysds_hops_codegen_SpoofCompiler
+ * Method: compile_cuda_kernel
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
+ JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class: org_apache_sysds_runtime_instructions_gpu_SpoofCUDAInstruction
+ * Method: execute_d
+ * Signature: (...)Z
+ */
+JNIEXPORT jdouble JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
+ JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jdoubleArray, jlong, jlong, jlong);
+
+/*
+ * Class: org_apache_sysds_runtime_instructions_gpu_SpoofCUDAInstruction
+ * Method: execute_f
+ * Signature: (...)Z
+ */
+JNIEXPORT jfloat JNICALL
+Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f(
+ JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jfloatArray, jlong, jlong, jlong);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // JNI_BRIDGE_H
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
new file mode 100644
index 0000000..2f76802
--- /dev/null
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -0,0 +1,54 @@
+%TMP%
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// CellType: %TYPE%
+// AggOp: %AGG_OP_NAME%
+// SparseSafe: %SPARSE_SAFE%
+// SEQ: %SEQ%
+
+#include "agg_ops.cuh"
+#include "reduction.cuh"
+#include "spoof_utils.cuh"
+#include "utils.cuh"
+
+// Functor wrapping the generated cell-wise expression; invoked once per
+// dense input cell by the %TYPE% aggregation skeleton instantiated below.
+// (%...% placeholders are substituted by the Java-side code generator.)
+template<typename T>
+struct SpoofCellwiseOp {
+ T**b; T* scalars; // side inputs and scalar inputs of the fused operator
+ int m, n, grix_; // matrix dims and global row-index offset
+
+ SpoofCellwiseOp(T** b, T* scalars, int m, int n, int grix) :
+ b(b), scalars(scalars), m(m), n(n), grix_(grix) {}
+
+ __device__ __forceinline__ T operator()(T a, int idx) const {
+ int rix = idx / n; // row of this cell (row-major layout)
+ int cix = idx % n; // column of this cell
+ int grix = grix_ + rix; // global row index visible to the generated body
+%BODY_dense%
+ return %OUT%;
+ }
+};
+
+// Kernel entry point; %TMP% is replaced by the operator name at codegen time.
+template<typename T>
+__global__ void %TMP% (T *a, T** b, T* c, T* scalars, int m, int n, int grix) {
+ %AGG_OP%<T> agg_op;
+ SpoofCellwiseOp<T> spoof_op(b, scalars, m, n, grix);
+ %TYPE%<T, %AGG_OP%<T>, SpoofCellwiseOp<T>>(a, c, m, n, %INITIAL_VALUE%, agg_op, spoof_op);
+};
diff --git a/src/main/cuda/spoof/functions.cuh b/src/main/cuda/spoof/functions.cuh
new file mode 100644
index 0000000..55f3ee3
--- /dev/null
+++ b/src/main/cuda/spoof/functions.cuh
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+__constant__ double DOUBLE_EPS = 1.11022E-16; // 2 ^ -53
+__constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26
+__constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it
+
+// Floor of a (with epsilon tolerance) as a 64-bit integer value.
+// NOTE: despite the historical name, the return type must be SIGNED: the
+// expression below is negative for negative inputs, and converting a negative
+// floating-point value to an unsigned integer type is undefined behavior in
+// C++. Callers (e.g. intDiv) rely on correct negative results.
+__device__ long long toUInt64(double a) {
+ return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS));
+}
+
+// Single-precision variant of toUInt64 (signed for the same reason).
+__device__ int toUInt32(float a) {
+ return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS));
+}
+
+// Returns element rowIndex of a dense vector.
+template<typename T>
+__device__ T getValue(T* data, int rowIndex) {
+ return data[rowIndex];
+}
+
+// Returns element (rowIndex, colIndex) of a dense row-major matrix with n columns.
+template<typename T>
+__device__ T getValue(T* data, int n, int rowIndex, int colIndex) {
+ return data[rowIndex * n + colIndex];
+}
+
+// Integer division: a/b floored to an integer value (via toUInt64/toUInt32),
+// with NaN/Inf quotients propagated unchanged.
+template<typename T>
+__device__ T intDiv(T a, T b);
+
+template<>
+__device__ double intDiv(double a, double b) {
+ double ret = a / b;
+ return (isnan(ret) || isinf(ret)) ? ret : toUInt64(ret);
+}
+
+template<>
+__device__ float intDiv(float a, float b) {
+ float ret = a / b;
+ return (isnan(ret) || isinf(ret)) ? ret : toUInt32(ret);
+}
+
+// Modulus: a - intDiv(a, b) * b; returns NaN for (near-)zero divisors.
+// NOTE(review): CUDART_NAN / CUDART_NAN_F are defined in CUDA's
+// math_constants.h, which is not included here -- confirm the kernel
+// preamble provides it.
+template<typename T>
+__device__ T modulus(T a, T b);
+
+template<>
+__device__ double modulus(double a, double b) {
+ if (fabs(b) < DOUBLE_EPS)
+ return CUDART_NAN;
+ return a - intDiv(a, b) * b;
+}
+
+template<>
+__device__ float modulus(float a, float b) {
+ if (fabs(b) < FLOAT_EPS)
+ return CUDART_NAN_F;
+ return a - intDiv(a, b) * b;
+}
+
+// Bitwise AND on the bit representation of the operands, returned as the
+// numeric value of the ANDed pattern (matching the original semantics).
+// Uses CUDA's bit-reinterpretation intrinsics instead of reinterpret_cast
+// via `long`: `long` is only 32 bits on Windows (LLP64), which would
+// truncate the upper half of the double's bit pattern, and pointer
+// type-punning violates strict aliasing.
+// NOTE(review): ANDing bit patterns (rather than the integer *values* of
+// the operands) looks inconsistent with the CPU backend -- confirm intended
+// semantics.
+template<typename T>
+__device__ T bwAnd(T a, T b);
+
+template<>
+__device__ double bwAnd(double a, double b) {
+ return __double_as_longlong(a) & __double_as_longlong(b);
+}
+
+template<>
+__device__ float bwAnd(float a, float b) {
+ return __float_as_int(a) & __float_as_int(b);
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java b/src/main/java/org/apache/sysds/api/DMLScript.java
index 758fde6..d32731f 100644
--- a/src/main/java/org/apache/sysds/api/DMLScript.java
+++ b/src/main/java/org/apache/sysds/api/DMLScript.java
@@ -45,6 +45,7 @@ import org.apache.sysds.conf.CompilerConfig;
import org.apache.sysds.conf.ConfigurationManager;
import org.apache.sysds.conf.DMLConfig;
import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
import org.apache.sysds.lops.Lop;
import org.apache.sysds.parser.DMLProgram;
import org.apache.sysds.parser.DMLTranslator;
@@ -97,7 +98,7 @@ public class DMLScript
public static boolean LINEAGE_DEDUP = DMLOptions.defaultOptions.lineage_dedup; // whether deduplicate lineage items
public static ReuseCacheType LINEAGE_REUSE = DMLOptions.defaultOptions.linReuseType; // whether lineage-based reuse
public static LineageCachePolicy LINEAGE_POLICY = DMLOptions.defaultOptions.linCachePolicy; // lineage cache eviction policy
- public static boolean CHECK_PRIVACY = DMLOptions.defaultOptions.checkPrivacy; // Check which privacy constraints are loaded and checked during federated execution
+ public static boolean CHECK_PRIVACY = DMLOptions.defaultOptions.checkPrivacy; // Check which privacy constraints are loaded and checked during federated execution
public static boolean USE_ACCELERATOR = DMLOptions.defaultOptions.gpu;
public static boolean FORCE_ACCELERATOR = DMLOptions.defaultOptions.forceGPU;
@@ -179,7 +180,7 @@ public class DMLScript
* @return true if success, false otherwise
* @throws IOException If an internal IOException happens.
*/
- public static boolean executeScript( Configuration conf, String[] args )
+ public static boolean executeScript( Configuration conf, String[] args )
throws IOException, ParseException, DMLScriptException
{
//parse arguments and set execution properties
@@ -364,7 +365,10 @@ public class DMLScript
//Step 1: parse configuration files & write any configuration specific global variables
loadConfiguration(fnameOptConfig);
-
+
+ //Step 2: configure codegen
+ configureCodeGen();
+
//Step 3: parse dml script
Statistics.startCompileTimer();
ParserWrapper parser = ParserFactory.createParser();
@@ -416,7 +420,7 @@ public class DMLScript
cleanupHadoopExecution( ConfigurationManager.getDMLConfig());
}
}
-
+
/**
* Sets the global flags in DMLScript based on user provided configuration
*
@@ -493,8 +497,8 @@ public class DMLScript
//0) cleanup federated workers if necessary
FederatedData.clearFederatedWorkers();
-
- //1) cleanup scratch space (everything for current uuid)
+
+ //1) cleanup scratch space (everything for current uuid)
//(required otherwise export to hdfs would skip assumed unnecessary writes if same name)
HDFSTool.deleteFileIfExistOnHDFS( config.getTextValue(DMLConfig.SCRATCH_SPACE) + dirSuffix );
@@ -560,7 +564,7 @@ public class DMLScript
/**
* Print the error in a user friendly manner.
- *
+ *
* @param e The exception thrown.
*/
public static void errorPrint(Exception e){
@@ -584,4 +588,19 @@ public class DMLScript
sb.append("\n" + ANSI_RESET);
System.out.println(sb.toString());
}
+
+ /**
+ * Loads the native code generator library if codegen is enabled and a
+ * non-Java generator API is configured via sysds.codegen.api.
+ * NOTE(review): GeneratorAPI.valueOf throws IllegalArgumentException for
+ * unknown config values -- consider catching and falling back to JAVA.
+ */
+ private static void configureCodeGen() {
+ // load native codegen if configured
+ if(ConfigurationManager.isCodegenEnabled()) {
+ SpoofCompiler.GeneratorAPI configured_generator = SpoofCompiler.GeneratorAPI.valueOf(ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.CODEGEN_API).toUpperCase());
+ if(configured_generator != SpoofCompiler.GeneratorAPI.JAVA) {
+ try {
+ SpoofCompiler.loadNativeCodeGenerator(configured_generator);
+ }
+ catch(Exception e) {
+ LOG.error("Failed to load native cuda codegen library\n" + e);
+ }
+ }
+ }
+ }
}
diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
index fdc3602..516b956 100644
--- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
@@ -60,10 +60,10 @@ public class ConfigurationManager
/**
- * Returns a cached JobConf object, intended for global use by all operations
+ * Returns a cached JobConf object, intended for global use by all operations
* with read-only access to job conf. This prevents to read the hadoop conf files
- * over and over again from classpath. However,
- *
+ * over and over again from classpath. However,
+ *
* @return the cached JobConf
*/
public static JobConf getCachedJobConf() {
@@ -177,11 +177,7 @@ public class ConfigurationManager
public static boolean isCodegenEnabled() {
return (getDMLConfig().getBooleanValue(DMLConfig.CODEGEN)
- || getCompilerConfigFlag(ConfigType.CODEGEN_ENABLED))
- && !DMLScript.USE_ACCELERATOR;
- //note: until codegen is supported for the GPU backend, we globally
- //disable codegen if operations are forced to the GPU to avoid
- //a counter-productive impact on performance.
+ || getCompilerConfigFlag(ConfigType.CODEGEN_ENABLED));
}
///////////////////////////////////////
diff --git a/src/main/java/org/apache/sysds/conf/DMLConfig.java b/src/main/java/org/apache/sysds/conf/DMLConfig.java
index 7bba416..9c1b65a 100644
--- a/src/main/java/org/apache/sysds/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysds/conf/DMLConfig.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.hops.codegen.SpoofCompiler.CompilerType;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
import org.apache.sysds.hops.codegen.SpoofCompiler.PlanSelector;
import org.apache.sysds.lops.Compression;
import org.apache.sysds.parser.ParseException;
@@ -72,6 +73,7 @@ public class DMLConfig
public static final String NATIVE_BLAS = "sysds.native.blas";
public static final String NATIVE_BLAS_DIR = "sysds.native.blas.directory";
public static final String CODEGEN = "sysds.codegen.enabled"; //boolean
+ public static final String CODEGEN_API = "sysds.codegen.api"; // see SpoofCompiler.API
public static final String CODEGEN_COMPILER = "sysds.codegen.compiler"; //see SpoofCompiler.CompilerType
public static final String CODEGEN_OPTIMIZER = "sysds.codegen.optimizer"; //see SpoofCompiler.PlanSelector
public static final String CODEGEN_PLANCACHE = "sysds.codegen.plancache"; //boolean
@@ -118,6 +120,7 @@ public class DMLConfig
_defaultVals.put(COMPRESSED_LOSSY, "false" );
_defaultVals.put(COMPRESSED_VALID_COMPRESSIONS, "DDC,OLE,RLE");
_defaultVals.put(CODEGEN, "false" );
+ _defaultVals.put(CODEGEN_API, GeneratorAPI.JAVA.name() );
_defaultVals.put(CODEGEN_COMPILER, CompilerType.AUTO.name() );
_defaultVals.put(CODEGEN_OPTIMIZER, PlanSelector.FUSE_COST_BASED_V2.name() );
_defaultVals.put(CODEGEN_PLANCACHE, "true" );
@@ -379,7 +382,7 @@ public class DMLConfig
LOCAL_TMP_DIR,SCRATCH_SPACE,OPTIMIZATION_LEVEL, DEFAULT_BLOCK_SIZE,
CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
COMPRESSED_LINALG, COMPRESSED_LOSSY, COMPRESSED_VALID_COMPRESSIONS,
- CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
+ CODEGEN, CODEGEN_API, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO,
AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY,
LOCAL_SPARK_NUM_THREADS, EVICTION_SHADOW_BUFFERSIZE, GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR
diff --git a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
index 2d50056..d388583 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
@@ -19,15 +19,16 @@
package org.apache.sysds.hops.codegen;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
import java.util.Map.Entry;
+import java.util.jar.JarEntry;
+import java.util.jar.JarFile;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.SystemUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
@@ -82,6 +83,7 @@ import org.apache.sysds.parser.WhileStatementBlock;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.codegen.CodegenUtils;
+import org.apache.sysds.runtime.codegen.SpoofCUDA;
import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;
import org.apache.sysds.runtime.codegen.SpoofRowwise.RowType;
import org.apache.sysds.runtime.controlprogram.BasicProgramBlock;
@@ -93,36 +95,48 @@ import org.apache.sysds.runtime.controlprogram.Program;
import org.apache.sysds.runtime.controlprogram.ProgramBlock;
import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
import org.apache.sysds.runtime.instructions.Instruction;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
import org.apache.sysds.runtime.lineage.LineageItemUtils;
import org.apache.sysds.runtime.matrix.data.Pair;
import org.apache.sysds.utils.Explain;
+import org.apache.sysds.utils.NativeHelper;
import org.apache.sysds.utils.Statistics;
-public class SpoofCompiler
-{
+public class SpoofCompiler {
private static final Log LOG = LogFactory.getLog(SpoofCompiler.class.getName());
-
+
//internal configuration flags
- public static CompilerType JAVA_COMPILER = CompilerType.JANINO;
- public static PlanSelector PLAN_SEL_POLICY = PlanSelector.FUSE_COST_BASED_V2;
+ public static CompilerType JAVA_COMPILER = CompilerType.JANINO;
+ public static PlanSelector PLAN_SEL_POLICY = PlanSelector.FUSE_COST_BASED_V2;
public static final IntegrationType INTEGRATION = IntegrationType.RUNTIME;
public static final boolean RECOMPILE_CODEGEN = true;
public static final boolean PRUNE_REDUNDANT_PLANS = true;
public static PlanCachePolicy PLAN_CACHE_POLICY = PlanCachePolicy.CSLH;
public static final int PLAN_CACHE_SIZE = 1024; //max 1K classes
public static final RegisterAlloc REG_ALLOC_POLICY = RegisterAlloc.EXACT_STATIC_BUFF;
-
+ public static GeneratorAPI API = GeneratorAPI.JAVA;
+ public static HashMap<GeneratorAPI, Long> native_contexts;
+
public enum CompilerType {
AUTO,
JAVAC,
JANINO,
+ NVCC,
+ NVRTC
}
-
+
+
+	// Backend used to generate and compile fused operator code.
+	// AUTO resolves to CUDA when the GPU accelerator is enabled, JAVA otherwise
+	// (see loadNativeCodeGenerator).
+	public enum GeneratorAPI {
+		AUTO,
+		JAVA,
+		CUDA
+	}
+
public enum IntegrationType {
HOPS,
RUNTIME,
}
-
+
public enum PlanSelector {
FUSE_ALL, //maximal fusion, possible w/ redundant compute
FUSE_NO_REDUNDANCY, //fusion without redundant compute
@@ -143,18 +157,143 @@ public class SpoofCompiler
CONSTANT, //plan cache, with always compile literals
CSLH, //plan cache, with context-sensitive literal replacement heuristic
NONE; //no plan cache
-
+
public static PlanCachePolicy get(boolean planCache, boolean compileLiterals) {
return !planCache ? NONE : compileLiterals ? CONSTANT : CSLH;
}
}
-
+
public enum RegisterAlloc {
HEURISTIC, //max vector intermediates, special handling pipelines (always safe)
EXACT_DYNAMIC_BUFF, //min number of live vector intermediates, assuming dynamic pooling
EXACT_STATIC_BUFF, //min number of live vector intermediates, assuming static array ring buffer
}
-
+
+	// Best-effort release of codegen resources (incl. the native CUDA context)
+	// when this compiler object is garbage collected.
+	// NOTE(review): finalize() is deprecated and not guaranteed to run promptly
+	// (or at all) -- consider an explicit shutdown hook instead; confirm lifecycle.
+	@Override
+	protected void finalize() {
+		SpoofCompiler.cleanupCodeGenerator();
+	}
+
+	/**
+	 * Loads and initializes the native code generator backend (currently CUDA
+	 * only) and sets the global {@code API} accordingly. On any failure the
+	 * method logs the problem and falls back to the JAVA generator.
+	 * No-op in SPARK execution mode and when a context for the requested
+	 * generator has already been created.
+	 *
+	 * @param generator requested backend; AUTO resolves to CUDA iff the GPU
+	 *                  accelerator is enabled, CUDA degrades to JAVA otherwise
+	 */
+	public static void loadNativeCodeGenerator(GeneratorAPI generator) {
+		if(DMLScript.getGlobalExecMode() == ExecMode.SPARK) {
+			LOG.warn("Not loading native codegen library in SPARK execution mode!\n");
+			return;
+		}
+
+		// loading cuda codegen (the only supported API atm)
+		if(generator == GeneratorAPI.AUTO && DMLScript.USE_ACCELERATOR)
+			generator = GeneratorAPI.CUDA;
+
+		// CUDA without an enabled accelerator is not usable -> degrade to JAVA
+		if(generator == GeneratorAPI.CUDA && !DMLScript.USE_ACCELERATOR)
+			generator = GeneratorAPI.JAVA;
+
+		// lazily created map of generator -> native context pointer
+		if(native_contexts == null)
+			native_contexts = new HashMap<>();
+
+		if(!native_contexts.containsKey(generator)) {
+			if(generator == GeneratorAPI.CUDA) {
+				// init GPUs with jCuda to avoid double initialization problems
+				GPUContextPool.initializeGPU();
+
+				// assemble platform-specific library file name,
+				// e.g. libsystemds_spoof_cuda-Linux-x86_64.so
+				String arch = SystemUtils.OS_ARCH;
+				String os = SystemUtils.OS_NAME;
+				String suffix = ".so";
+
+				if(SystemUtils.IS_OS_LINUX && SystemUtils.OS_ARCH.equalsIgnoreCase("amd64"))
+					arch = "x86_64";
+				if(SystemUtils.IS_OS_WINDOWS) {
+					os = "Windows";
+					suffix = ".dll";
+				}
+
+				String libName = "libsystemds_spoof_cuda-" + os + "-" + arch + suffix;
+
+				// search order: source tree, build output, system path, jar resource
+				// ToDo: remove legacy paths
+				boolean isLoaded = NativeHelper.loadBLAS(System.getProperty("user.dir")
+					+ "/src/main/cpp/lib".replace("/",File.separator), libName, "");
+
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadBLAS(System.getProperty("user.dir")
+						+ "/target/classes/lib".replace("/", File.separator), libName, "");
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadBLAS(null, libName, "");
+				if(!isLoaded)
+					isLoaded = NativeHelper.loadLibraryHelperFromResource(libName);
+
+				if(isLoaded) {
+					// the native compiler reads the cuda templates from disk; when
+					// running from a jar, extract them to the local tmp dir first
+					String local_tmp = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.LOCAL_TMP_DIR);
+					String jar_path = SpoofCompiler.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+					if(jar_path.contains(".jar")) {
+						try {
+							extractCodegenSources(local_tmp, jar_path);
+						}
+						catch (IOException e){
+							LOG.error("Could not extract spoof files from jar: " + e);
+							API = GeneratorAPI.JAVA;
+							return;
+						}
+					}
+					else {
+						// running from a source checkout -> use templates in-place
+						local_tmp = System.getProperty("user.dir") + "/src/main".replace("/", File.separator);
+					}
+
+					// JNI call; returns 0 on failure
+					long ctx_ptr = initialize_cuda_context(0, local_tmp);
+					if(ctx_ptr != 0) {
+						native_contexts.put(GeneratorAPI.CUDA, ctx_ptr);
+						API = GeneratorAPI.CUDA;
+						LOG.info("Successfully loaded spoof cuda library");
+					}
+					else {
+						API = GeneratorAPI.JAVA;
+						LOG.error("Failed to initialize spoof cuda context. Falling back to java codegen\n");
+					}
+				}
+				else {
+					API = GeneratorAPI.JAVA;
+					LOG.error("Loading of spoof native cuda failed. Falling back to java codegen\n");
+				}
+			}
+		}
+	}
+
+	/**
+	 * Destroys the native CUDA codegen context created by
+	 * {@code loadNativeCodeGenerator()} and falls back to the JAVA generator.
+	 * Safe to call repeatedly and before any load (no-op in that case).
+	 */
+	public static void unloadNativeCodeGenerator() {
+		// guard against unload without prior load (native_contexts may be null)
+		if(native_contexts != null && native_contexts.containsKey(GeneratorAPI.CUDA)) {
+			destroy_cuda_context(native_contexts.get(GeneratorAPI.CUDA), 0);
+			native_contexts.remove(GeneratorAPI.CUDA);
+			if(API == GeneratorAPI.CUDA)
+				API = GeneratorAPI.JAVA;
+		}
+	}
+
+	/**
+	 * Extracts the packaged CUDA codegen sources (jar entries under "cuda/")
+	 * into the local temporary directory so the native compiler can read them
+	 * from disk. Extracted files/dirs are scheduled for deletion on JVM exit.
+	 *
+	 * @param resource_path destination directory (local tmp dir)
+	 * @param jar_path      path of the systemds jar to read from
+	 * @throws IOException if reading the jar or writing an output file fails
+	 */
+	private static void extractCodegenSources(String resource_path, String jar_path) throws IOException {
+		// close the jar file even on error to avoid a file-handle leak
+		try(JarFile jar_file = new JarFile(jar_path)) {
+			Enumeration<JarEntry> files_in_jar = jar_file.entries();
+			while (files_in_jar.hasMoreElements()) {
+				JarEntry in_file = files_in_jar.nextElement();
+				if (in_file.getName().startsWith("cuda/") && !in_file.isDirectory()) {
+					File out_file = new File(resource_path, in_file.getName());
+					out_file.deleteOnExit();
+					File parent = out_file.getParentFile();
+					if (parent != null) {
+						parent.mkdirs();
+						parent.deleteOnExit();
+					}
+					// close both streams explicitly (IOUtils.copy does not close)
+					try(java.io.InputStream in = jar_file.getInputStream(in_file);
+						java.io.OutputStream out = FileUtils.openOutputStream(out_file)) {
+						IOUtils.copy(in, out);
+					}
+				}
+			}
+		}
+	}
+
+	// Compiles one generated CUDA source string through the native SPOOF
+	// compiler context; returns true on successful kernel compilation.
+	// NOTE(review): assumes a CUDA context was created by
+	// loadNativeCodeGenerator() -- verify callers check API == CUDA first.
+	private static boolean compile_cuda(String name, String src) {
+		return compile_cuda_kernel(native_contexts.get(GeneratorAPI.CUDA), name, src);
+	}
+
+	// JNI entry points implemented by libsystemds_spoof_cuda:
+	// creates a native codegen context for the given device; returns a native
+	// pointer (0 on failure) that must be released via destroy_cuda_context
+	private static native long initialize_cuda_context(int device_id, String resource_path);
+
+	// compiles one generated kernel within the given native context
+	private static native boolean compile_cuda_kernel(long ctx, String name, String src);
+
+	// releases the native context created by initialize_cuda_context
+	private static native void destroy_cuda_context(long ctx, int device_id);
+
//plan cache for cplan->compiled source to avoid unnecessary codegen/source code compile
//for equal operators from (1) different hop dags and (2) repeated recompilation
//note: if PLAN_CACHE_SIZE is exceeded, we evict the least-recently-used plan (LRU policy)
@@ -370,9 +509,23 @@ public class SpoofCompiler
Class<?> cla = planCache.getPlan(tmp.getValue());
if( cla == null ) {
- //generate java source code
- String src = tmp.getValue().codegen(false);
-
+ String src = "";
+ boolean native_compiled_successfully = false;
+
+ if(API == GeneratorAPI.CUDA && tmp.getValue().isSupported(API)) {
+ src = tmp.getValue().codegen(false, GeneratorAPI.CUDA);
+ native_compiled_successfully = compile_cuda(tmp.getValue().getVarname(), src);
+ if (native_compiled_successfully)
+ CodegenUtils.putNativeOpData(new SpoofCUDA(tmp.getValue()));
+ else
+ LOG.warn("CUDA compilation failed, falling back to JAVA");
+ }
+
+ if(API == GeneratorAPI.JAVA || !native_compiled_successfully) {
+ src = tmp.getValue().codegen(false, GeneratorAPI.JAVA);
+ cla = CodegenUtils.compileClass("codegen."+ tmp.getValue().getClassname(), src);
+ }
+
//explain debug output cplans or generated source code
if( LOG.isTraceEnabled() || DMLScript.EXPLAIN.isHopsType(recompile) ) {
LOG.info("Codegen EXPLAIN (generated cplan for HopID: " + cplan.getKey() +
@@ -385,11 +538,7 @@ public class SpoofCompiler
", line "+tmp.getValue().getBeginLine() + ", hash="+tmp.getValue().hashCode()+"):");
LOG.info(src);
}
-
- //compile generated java source code
- cla = CodegenUtils.compileClass("codegen."+
- tmp.getValue().getClassname(), src);
-
+
//maintain plan cache
if( PLAN_CACHE_POLICY!=PlanCachePolicy.NONE )
planCache.putPlan(tmp.getValue(), cla);
@@ -399,7 +548,7 @@ public class SpoofCompiler
}
//make class available and maintain hits
- if(cla != null)
+ if(cla != null || API != GeneratorAPI.JAVA)
clas.put(cplan.getKey(), new Pair<Hop[],Class<?>>(tmp.getKey(),cla));
if( DMLScript.STATISTICS )
Statistics.incrementCodegenOpCacheTotal();
@@ -442,6 +591,10 @@ public class SpoofCompiler
CodegenUtils.clearClassCache(); //class cache
planCache.clear(); //plan cache
}
+
+ if(API != GeneratorAPI.JAVA)
+ unloadNativeCodeGenerator();
+
}
/**
@@ -594,7 +747,7 @@ public class SpoofCompiler
CNodeTpl tmpCNode = cplans.get(hop.getHopID()).getValue();
hnew = new SpoofFusedOp(hop.getName(), hop.getDataType(), hop.getValueType(),
- tmpCla.getValue(), false, tmpCNode.getOutputDimType());
+ tmpCla.getValue(), tmpCNode.getGeneratorAPI(), tmpCNode.getVarname(), false, tmpCNode.getOutputDimType());
Hop[] inHops = tmpCla.getKey();
diff --git a/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
index 3aca219..598d956 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/SpoofFusedOp.java
@@ -21,6 +21,7 @@ package org.apache.sysds.hops.codegen;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
import org.apache.sysds.hops.Hop;
import org.apache.sysds.hops.MemoTable;
import org.apache.sysds.hops.MultiThreadedHop;
@@ -33,6 +34,7 @@ import org.apache.sysds.runtime.meta.DataCharacteristics;
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
import java.util.ArrayList;
+import java.util.Objects;
public class SpoofFusedOp extends MultiThreadedHop
{
@@ -55,16 +57,21 @@ public class SpoofFusedOp extends MultiThreadedHop
private boolean _distSupported = false;
private long _constDim2 = -1;
private SpoofOutputDimsType _dimsType;
-
+ private GeneratorAPI _api = GeneratorAPI.JAVA;
+ private String _genVarName;
+
public SpoofFusedOp ( ) {
}
- public SpoofFusedOp( String name, DataType dt, ValueType vt, Class<?> cla, boolean dist, SpoofOutputDimsType type ) {
+ public SpoofFusedOp( String name, DataType dt, ValueType vt, Class<?> cla, GeneratorAPI api, String genVarName,
+ boolean dist, SpoofOutputDimsType type ) {
super(name, dt, vt);
_class = cla;
_distSupported = dist;
_dimsType = type;
+ _api = api;
+ _genVarName = genVarName;
}
@Override
@@ -81,7 +88,10 @@ public class SpoofFusedOp extends MultiThreadedHop
@Override
public boolean isGPUEnabled() {
- return false;
+ if(_api == GeneratorAPI.CUDA)
+ return true;
+ else
+ return false;
}
@Override
@@ -91,10 +101,13 @@ public class SpoofFusedOp extends MultiThreadedHop
@Override
protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) {
- return _class.getGenericSuperclass().equals(SpoofRowwise.class) ?
- OptimizerUtils.estimateSize(dim1, dim2) :
- OptimizerUtils.estimatePartitionedSizeExactSparsity(
- dim1, dim2, getBlocksize(), nnz);
+ if(_api == GeneratorAPI.JAVA) {
+ return _class.getGenericSuperclass().equals(SpoofRowwise.class) ?
+ OptimizerUtils.estimateSize(dim1, dim2) :
+ OptimizerUtils.estimatePartitionedSizeExactSparsity(dim1, dim2, getBlocksize(), nnz);
+ }
+ else
+ return OptimizerUtils.estimatePartitionedSizeExactSparsity(dim1, dim2, getBlocksize(), nnz);
}
@Override
@@ -114,7 +127,7 @@ public class SpoofFusedOp extends MultiThreadedHop
inputs.add(c.constructLops());
int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
- SpoofFused lop = new SpoofFused(inputs, getDataType(), getValueType(), _class, k, et);
+ SpoofFused lop = new SpoofFused(inputs, getDataType(), getValueType(), _class, _api, _genVarName, k, et);
setOutputDimensions(lop);
setLineNumbers(lop);
setLops(lop);
@@ -140,12 +153,16 @@ public class SpoofFusedOp extends MultiThreadedHop
@Override
public String getOpString() {
- return "spoof("+_class.getSimpleName()+")";
- }
+ if(_class != null)
+ return "spoof("+_class.getSimpleName()+")";
+ else
+ return "spoof(" + getName() + ")"; }
public String getClassName() {
- return _class.getName();
- }
+ if(_class != null)
+ return _class.getName();
+ else
+ return "spoof" + getName(); }
@Override
protected DataCharacteristics inferOutputCharacteristics( MemoTable memo )
@@ -297,11 +314,12 @@ public class SpoofFusedOp extends MultiThreadedHop
SpoofFusedOp that2 = (SpoofFusedOp)that;
//note: class implies dims type as well
- boolean ret = ( _class.equals(that2._class)
+ boolean ret = (Objects.equals(_class, that2._class)
&& _distSupported == that2._distSupported
&& _maxNumThreads == that2._maxNumThreads
&& _constDim2 == that2._constDim2
- && getInput().size() == that2.getInput().size());
+ && getInput().size() == that2.getInput().size()
+ && _api == that2._api);
if( ret ) {
for( int i=0; i<getInput().size(); i++ )
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
index 38a7bc3..a2f918e 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java
@@ -25,6 +25,9 @@ import org.apache.sysds.hops.codegen.template.TemplateUtils;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
+
+import static org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI.CUDA;
public abstract class CNode
{
@@ -79,7 +82,10 @@ public abstract class CNode
public String getVarname() {
return _genVar;
}
-
+
+ public String getVarname(GeneratorAPI api) { return getVarname(); }
+
+
public String getVectorLength() {
if( getVarname().startsWith("a") )
return "len";
@@ -161,7 +167,7 @@ public abstract class CNode
setVisited(false);
}
- public abstract String codegen(boolean sparse);
+ public abstract String codegen(boolean sparse, GeneratorAPI api);
public abstract void setOutputDims();
@@ -228,4 +234,36 @@ public abstract class CNode
return tmp;
}
+
+	/**
+	 * Resolves the language-specific code template implementation for the
+	 * calling node type and the target backend (cpp package for CUDA,
+	 * java package for the JAVA generator).
+	 *
+	 * @param caller the node requesting a template (cell/unary/binary/ternary)
+	 * @param api    target generator API
+	 * @return template instance, or null if the node type has no template
+	 */
+	protected CodeTemplate getLanguageTemplateClass(CNode caller, GeneratorAPI api) {
+		switch (api) {
+			case CUDA:
+				if(caller instanceof CNodeCell)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.CellWise();
+				else if (caller instanceof CNodeUnary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Unary();
+				else if (caller instanceof CNodeBinary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Binary();
+				else if (caller instanceof CNodeTernary)
+					return new org.apache.sysds.hops.codegen.cplan.cpp.Ternary();
+				else
+					return null;
+			case JAVA:
+				if(caller instanceof CNodeCell)
+					return new org.apache.sysds.hops.codegen.cplan.java.CellWise();
+				else if (caller instanceof CNodeUnary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Unary();
+				else if (caller instanceof CNodeBinary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Binary();
+				else if (caller instanceof CNodeTernary)
+					return new org.apache.sysds.hops.codegen.cplan.java.Ternary();
+
+				else
+					return null;
+			default:
+				// AUTO should be resolved before codegen; anything else is a bug
+				throw new RuntimeException("API not supported by code generator: " + api.toString());
+		}
+	}
+
+ public abstract boolean isSupported(GeneratorAPI api);
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
index eed8389..15a26bc 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java
@@ -25,10 +25,10 @@ import org.apache.commons.lang.StringUtils;
import org.apache.sysds.hops.codegen.template.TemplateUtils;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
+public class CNodeBinary extends CNode {
-public class CNodeBinary extends CNode
-{
public enum BinType {
//matrix multiplication operations
DOT_PRODUCT, VECT_MATRIXMULT, VECT_OUTERMULT_ADD,
@@ -76,154 +76,6 @@ public class CNodeBinary extends CNode
return ssComm || vsComm || vvComm;
}
- public String getTemplate(boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
- switch (this) {
- case DOT_PRODUCT:
- return sparseLhs ? " double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
- " double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- case VECT_MATRIXMULT:
- return sparseLhs ? " double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- case VECT_OUTERMULT_ADD:
- return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
- sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
- " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
-
- //vector-scalar-add operations
- case VECT_MULT_ADD:
- case VECT_DIV_ADD:
- case VECT_MINUS_ADD:
- case VECT_PLUS_ADD:
- case VECT_POW_ADD:
- case VECT_XOR_ADD:
- case VECT_MIN_ADD:
- case VECT_MAX_ADD:
- case VECT_EQUAL_ADD:
- case VECT_NOTEQUAL_ADD:
- case VECT_LESS_ADD:
- case VECT_LESSEQUAL_ADD:
- case VECT_GREATER_ADD:
- case VECT_GREATEREQUAL_ADD:
- case VECT_CBIND_ADD: {
- String vectName = getVectorPrimitiveName();
- if( scalarVector )
- return sparseLhs ? " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" :
- " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
- else
- return sparseLhs ? " LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" :
- " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
- }
-
- //vector-scalar operations
- case VECT_MULT_SCALAR:
- case VECT_DIV_SCALAR:
- case VECT_MINUS_SCALAR:
- case VECT_PLUS_SCALAR:
- case VECT_POW_SCALAR:
- case VECT_XOR_SCALAR:
- case VECT_BITWAND_SCALAR:
- case VECT_MIN_SCALAR:
- case VECT_MAX_SCALAR:
- case VECT_EQUAL_SCALAR:
- case VECT_NOTEQUAL_SCALAR:
- case VECT_LESS_SCALAR:
- case VECT_LESSEQUAL_SCALAR:
- case VECT_GREATER_SCALAR:
- case VECT_GREATEREQUAL_SCALAR: {
- String vectName = getVectorPrimitiveName();
- if( scalarVector )
- return sparseRhs ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
- else
- return sparseLhs ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
- }
-
- case VECT_CBIND:
- if( scalarInput )
- return " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
- else
- return sparseLhs ?
- " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
-
- //vector-vector operations
- case VECT_MULT:
- case VECT_DIV:
- case VECT_MINUS:
- case VECT_PLUS:
- case VECT_XOR:
- case VECT_BITWAND:
- case VECT_BIASADD:
- case VECT_BIASMULT:
- case VECT_MIN:
- case VECT_MAX:
- case VECT_EQUAL:
- case VECT_NOTEQUAL:
- case VECT_LESS:
- case VECT_LESSEQUAL:
- case VECT_GREATER:
- case VECT_GREATEREQUAL: {
- String vectName = getVectorPrimitiveName();
- return sparseLhs ?
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" :
- sparseRhs ?
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- }
-
- //scalar-scalar operations
- case MULT:
- return " double %TMP% = %IN1% * %IN2%;\n";
-
- case DIV:
- return " double %TMP% = %IN1% / %IN2%;\n";
- case PLUS:
- return " double %TMP% = %IN1% + %IN2%;\n";
- case MINUS:
- return " double %TMP% = %IN1% - %IN2%;\n";
- case MODULUS:
- return " double %TMP% = LibSpoofPrimitives.mod(%IN1%, %IN2%);\n";
- case INTDIV:
- return " double %TMP% = LibSpoofPrimitives.intDiv(%IN1%, %IN2%);\n";
- case LESS:
- return " double %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
- case LESSEQUAL:
- return " double %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
- case GREATER:
- return " double %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
- case GREATEREQUAL:
- return " double %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
- case EQUAL:
- return " double %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
- case NOTEQUAL:
- return " double %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
-
- case MIN:
- return " double %TMP% = Math.min(%IN1%, %IN2%);\n";
- case MAX:
- return " double %TMP% = Math.max(%IN1%, %IN2%);\n";
- case LOG:
- return " double %TMP% = Math.log(%IN1%)/Math.log(%IN2%);\n";
- case LOG_NZ:
- return " double %TMP% = (%IN1% == 0) ? 0 : Math.log(%IN1%)/Math.log(%IN2%);\n";
- case POW:
- return " double %TMP% = Math.pow(%IN1%, %IN2%);\n";
- case MINUS1_MULT:
- return " double %TMP% = 1 - %IN1% * %IN2%;\n";
- case MINUS_NZ:
- return " double %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
- case XOR:
- return " double %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1 : 0;\n";
- case BITWAND:
- return " double %TMP% = LibSpoofPrimitives.bwAnd(%IN1%, %IN2%);\n";
- case SEQ_RIX:
- return " double %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
-
- default:
- throw new RuntimeException("Invalid binary type: "+this.toString());
- }
- }
public boolean isVectorPrimitive() {
return isVectorScalarPrimitive()
|| isVectorVectorPrimitive()
@@ -286,15 +138,15 @@ public class CNodeBinary extends CNode
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
if( isGenerated() )
return "";
StringBuilder sb = new StringBuilder();
//generate children
- sb.append(_inputs.get(0).codegen(sparse));
- sb.append(_inputs.get(1).codegen(sparse));
+ sb.append(_inputs.get(0).codegen(sparse, api));
+ sb.append(_inputs.get(1).codegen(sparse, api));
//generate binary operation (use sparse template, if data input)
boolean lsparseLhs = sparse && _inputs.get(0) instanceof CNodeData
@@ -305,12 +157,14 @@ public class CNodeBinary extends CNode
boolean scalarVector = (_inputs.get(0).getDataType().isScalar()
&& _inputs.get(1).getDataType().isMatrix());
String var = createVarname();
- String tmp = _type.getTemplate(lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+// String tmp = _type.getTemplate(api, lang, lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+ String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparseLhs, lsparseRhs, scalarVector, scalarInput);
+
tmp = tmp.replace("%TMP%", var);
//replace input references and start indexes
for( int j=0; j<2; j++ ) {
- String varj = _inputs.get(j).getVarname();
+ String varj = _inputs.get(j).getVarname(api);
//replace sparse and dense inputs
tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals");
@@ -560,4 +414,15 @@ public class CNodeBinary extends CNode
return super.equals(that)
&& _type == that._type;
}
+
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		// binary operations exist for both backends; the node is only
+		// supported if every input subtree is supported as well
+		if( api != GeneratorAPI.CUDA && api != GeneratorAPI.JAVA )
+			return false;
+		for( CNode input : _inputs )
+			if( !input.isSupported(api) )
+				return false;
+		return true;
+	}
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index a894bae..3ea3d3b 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -22,31 +22,14 @@ package org.apache.sysds.hops.codegen.cplan;
import java.util.ArrayList;
import org.apache.sysds.common.Types.AggOp;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;
import org.apache.sysds.runtime.util.UtilFunctions;
public class CNodeCell extends CNodeTpl
-{
- private static final String TEMPLATE =
- "package codegen;\n"
- + "import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;\n"
- + "import org.apache.sysds.runtime.codegen.SpoofCellwise;\n"
- + "import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;\n"
- + "import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;\n"
- + "import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;\n"
- + "import org.apache.commons.math3.util.FastMath;\n"
- + "\n"
- + "public final class %TMP% extends SpoofCellwise {\n"
- + " public %TMP%() {\n"
- + " super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP%);\n"
- + " }\n"
- + " protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) { \n"
- + "%BODY_dense%"
- + " return %OUT%;\n"
- + " }\n"
- + "}\n";
-
+{
private CellType _type = null;
private AggOp _aggOp = null;
private boolean _sparseSafe = false;
@@ -83,7 +66,25 @@ public class CNodeCell extends CNodeTpl
public AggOp getAggOp() {
return _aggOp;
}
-
+
+	/**
+	 * Maps the hop-level aggregation operator of this cell template to the
+	 * runtime {@link SpoofCellwise.AggOp}, or returns null if no aggregation
+	 * is set (no-agg cell operation).
+	 */
+	public SpoofCellwise.AggOp getSpoofAggOp() {
+		if(_aggOp != null)
+			switch(_aggOp) {
+				case SUM:
+					return SpoofCellwise.AggOp.SUM;
+				case SUM_SQ:
+					return SpoofCellwise.AggOp.SUM_SQ;
+				case MIN:
+					return SpoofCellwise.AggOp.MIN;
+				case MAX:
+					return SpoofCellwise.AggOp.MAX;
+				default:
+					// report the offending aggregation op (not the cell type,
+					// which is unrelated here and may even be null)
+					throw new RuntimeException("Unsupported aggregation type: "+_aggOp.toString());
+			}
+		else
+			return null;
+	}
+
public void setSparseSafe(boolean flag) {
_sparseSafe = flag;
}
@@ -114,34 +115,63 @@ public class CNodeCell extends CNodeTpl
rRenameDataNode(_output, _inputs.get(0), "a");
renameInputs(_inputs, 1);
}
-
- @Override
- public String codegen(boolean sparse) {
- String tmp = TEMPLATE;
-
+
+	/**
+	 * Generates the fused cellwise operator source for the requested backend
+	 * by instantiating the language-specific template and filling in the
+	 * generated body and meta data placeholders.
+	 * NOTE(review): the override of the abstract CNode.codegen lost its
+	 * @Override annotation in this change -- consider restoring it.
+	 * NOTE(review): replaceAll() interprets pattern/replacement as regex;
+	 * harmless for these literal tokens, but plain replace() would be safer.
+	 */
+	public String codegen(boolean sparse, GeneratorAPI _api) {
+		api = _api;
+
+		String tmp = getLanguageTemplateClass(this, api).getTemplate(_type);
+
		//generate dense/sparse bodies
-		String tmpDense = _output.codegen(false);
+		String tmpDense = _output.codegen(false, api);
		_output.resetGenerated();
		tmp = tmp.replace("%TMP%", createVarname());
		tmp = tmp.replace("%BODY_dense%", tmpDense);
		//return last TMP
-		tmp = tmp.replace("%OUT%", _output.getVarname());
-
+		tmp = tmp.replaceAll("%OUT%", _output.getVarname());
+
		//replace meta data information
-		tmp = tmp.replace("%TYPE%", getCellType().name());
-		tmp = tmp.replace("%AGG_OP%", (_aggOp!=null) ? "AggOp."+_aggOp.name() : "null" );
+		tmp = tmp.replaceAll("%TYPE%", getCellType().name());
+		tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." + _aggOp.name() : "null");
		tmp = tmp.replace("%SPARSE_SAFE%", String.valueOf(isSparseSafe()));
		tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq()));
-
+
+		if(api == GeneratorAPI.CUDA) {
+			// map the aggregation op to the CUDA functor and its neutral element
+			// ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator
+			String agg_op = "IdentityOp";
+			String initial_value = "(T)4.0";
+			if(_aggOp != null)
+				switch(_aggOp) {
+					case SUM:
+						agg_op = "SumOp";
+						initial_value = "(T)0.0";
+						break;
+					case SUM_SQ:
+						agg_op = "SumSqOp";
+						initial_value = "(T)0.0";
+						break;
+					case MIN:
+						agg_op = "MinOp";
+						initial_value = "MAX<T>()";
+						break;
+					case MAX:
+						agg_op = "MaxOp";
+						initial_value = "-MAX<T>()";
+						break;
+					default:
+						agg_op = "IdentityOp";
+						initial_value = "(T)0.0";
+				}
+
+			tmp = tmp.replaceAll("%AGG_OP%", agg_op);
+			tmp = tmp.replaceAll("%INITIAL_VALUE%", initial_value);
+		}
		return tmp;
	}
@Override
public void setOutputDims() {
-
-
}
@Override
@@ -206,4 +236,8 @@ public class CNodeCell extends CNodeTpl
sb.append("]");
return sb.toString();
}
+	// Cellwise templates exist for both backends; the plan is only supported
+	// if the whole output subtree is supported by the requested API as well.
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) && _output.isSupported(api);
+	}
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
index 11d893e..b91c66f 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java
@@ -19,9 +19,13 @@
package org.apache.sysds.hops.codegen.cplan;
+import org.apache.commons.lang.StringUtils;
import org.apache.sysds.hops.Hop;
import org.apache.sysds.common.Types.DataType;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
public class CNodeData extends CNode
{
@@ -54,17 +58,48 @@ public class CNodeData extends CNode
@Override
public String getVarname() {
- if( "NaN".equals(_name) )
+ if ("NaN".equals(_name))
return "Double.NaN";
- else if( "Infinity".equals(_name) )
+ else if ("Infinity".equals(_name))
return "Double.POSITIVE_INFINITY";
- else if( "-Infinity".equals(_name) )
+ else if ("-Infinity".equals(_name))
return "Double.NEGATIVE_INFINITY";
- else if( "true".equals(_name) || "false".equals(_name) )
+ else if ("true".equals(_name) || "false".equals(_name))
return "true".equals(_name) ? "1d" : "0d";
else
return _name;
}
+
+	/**
+	 * Returns the backend-specific variable name or literal for this data node.
+	 * For JAVA, special values map to Double constants; for CUDA they map to
+	 * CUDART constants, and unsigned integer literals get an explicit floating
+	 * point suffix matching the configured precision.
+	 *
+	 * @param api target generator API (JAVA or CUDA)
+	 * @return variable name or backend literal
+	 * @throws RuntimeException for any other (unsupported) API value
+	 */
+	public String getVarname(GeneratorAPI api) {
+		if(api == GeneratorAPI.JAVA) {
+			if ("NaN".equals(_name))
+				return "Double.NaN";
+			else if ("Infinity".equals(_name))
+				return "Double.POSITIVE_INFINITY";
+			else if ("-Infinity".equals(_name))
+				return "Double.NEGATIVE_INFINITY";
+			else if ("true".equals(_name) || "false".equals(_name))
+				return "true".equals(_name) ? "1d" : "0d";
+			else
+				return _name;
+		}
+		else if(api == GeneratorAPI.CUDA) {
+			if ("NaN".equals(_name))
+				return isSinglePrecision() ? "CUDART_NAN_F" : "CUDART_NAN";
+			else if ("Infinity".equals(_name))
+				return isSinglePrecision() ? "CUDART_INF_F" : "CUDART_INF";
+			else if ("-Infinity".equals(_name))
+				return isSinglePrecision() ? "-CUDART_INF_F" : "-CUDART_INF";
+			else if ("true".equals(_name) || "false".equals(_name))
+				return "true".equals(_name) ? "1" : "0";
+			else if (StringUtils.isNumeric(_name))
+				// note: isNumeric only matches unsigned integer strings, so
+				// negative/decimal literals pass through unchanged
+				return isSinglePrecision() ? _name + ".0f" : _name + ".0";
+			else
+				return _name;
+		}
+		else
+			// report the unsupported argument, not the globally configured API
+			throw new RuntimeException("Unknown GeneratorAPI: " + api);
+	}
public long getHopID() {
return _hopID;
@@ -80,7 +115,7 @@ public class CNodeData extends CNode
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
return "";
}
@@ -113,4 +148,8 @@ public class CNodeData extends CNode
_name.equals(((CNodeData)o)._name) :
_hopID == ((CNodeData)o)._hopID));
}
+	// Data nodes (literals and input references) carry no operation logic
+	// and are therefore supported by every generator backend.
+	@Override
+	public boolean isSupported(GeneratorAPI api) {
+		return true;
+	}
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
index 2a5dec8..895d945 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
@@ -22,11 +22,12 @@ package org.apache.sysds.hops.codegen.cplan;
import java.util.ArrayList;
import java.util.Arrays;
+import org.apache.commons.collections.CollectionUtils;
import org.apache.sysds.hops.Hop;
import org.apache.sysds.common.Types.AggOp;
import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
-import org.apache.sysds.runtime.util.CollectionUtils;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeMultiAgg extends CNodeTpl
{
@@ -105,14 +106,14 @@ public class CNodeMultiAgg extends CNodeTpl
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
// note: ignore sparse flag, generate both
String tmp = TEMPLATE;
//generate dense/sparse bodies
StringBuilder sb = new StringBuilder();
for( CNode out : _outputs )
- sb.append(out.codegen(false));
+ sb.append(out.codegen(false, api));
for( CNode out : _outputs )
out.resetGenerated();
@@ -181,7 +182,7 @@ public class CNodeMultiAgg extends CNodeTpl
return false;
CNodeMultiAgg that = (CNodeMultiAgg)o;
return super.equals(o)
- && CollectionUtils.equals(_aggOps, that._aggOps)
+ && CollectionUtils.isEqualCollection(_aggOps, that._aggOps)
&& equalInputReferences(
_outputs, that._outputs, _inputs, that._inputs);
}
@@ -205,4 +206,14 @@ public class CNodeMultiAgg extends CNodeTpl
return null;
}
}
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
index a1b110d..5500ddb 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java
@@ -27,6 +27,7 @@ import org.apache.sysds.hops.codegen.template.TemplateUtils;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.util.DnnUtils;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeNary extends CNode
{
@@ -43,7 +44,7 @@ public class CNodeNary extends CNode
return true;
return false;
}
- public String getTemplate(boolean sparseGen, long len, ArrayList<CNode> inputs) {
+ public String getTemplate(boolean sparseGen, long len, ArrayList<CNode> inputs, GeneratorAPI api) {
switch (this) {
case VECT_CBIND:
StringBuilder sb = new StringBuilder();
@@ -111,7 +112,7 @@ public class CNodeNary extends CNode
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
if( isGenerated() )
return "";
@@ -119,14 +120,14 @@ public class CNodeNary extends CNode
//generate children
for(CNode in : _inputs)
- sb.append(in.codegen(sparse));
+ sb.append(in.codegen(sparse, api));
//generate nary operation (use sparse template, if data input)
boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
&& _inputs.get(0).getVarname().startsWith("a")
&& !_inputs.get(0).isLiteral());
String var = createVarname();
- String tmp = _type.getTemplate(lsparse, _cols, _inputs);
+ String tmp = _type.getTemplate(lsparse, _cols, _inputs, api);
tmp = tmp.replace("%TMP%", var);
//replace sparse and dense inputs
@@ -219,7 +220,18 @@ public class CNodeNary extends CNode
return super.equals(that)
&& _type == that._type;
}
-
+
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
+
private static String getDnnParameterString(List<CNode> inputs, boolean unary) {
int off = unary ? 0 : 1;
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
index 9f0aa69..6a3a647 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
@@ -25,7 +25,7 @@ import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
import org.apache.sysds.lops.MMTSJ;
import org.apache.sysds.runtime.codegen.SpoofOuterProduct.OutProdType;
import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeOuterProduct extends CNodeTpl
{
@@ -78,12 +78,12 @@ public class CNodeOuterProduct extends CNodeTpl
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
// note: ignore sparse flag, generate both
String tmp = TEMPLATE;
//generate dense/sparse bodies
- String tmpDense = _output.codegen(false);
+ String tmpDense = _output.codegen(false, api);
_output.resetGenerated();
tmp = tmp.replace("%TMP%", createVarname());
@@ -186,4 +186,15 @@ public class CNodeOuterProduct extends CNodeTpl
sb.append("]");
return sb.toString();
}
+
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
index 94f9ed9..b3304bd 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
@@ -26,6 +26,7 @@ import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType;
import org.apache.sysds.hops.codegen.template.TemplateUtils;
import org.apache.sysds.runtime.codegen.SpoofRowwise.RowType;
import org.apache.sysds.runtime.util.UtilFunctions;
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeRow extends CNodeTpl
{
@@ -95,15 +96,15 @@ public class CNodeRow extends CNodeTpl
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
// note: ignore sparse flag, generate both
String tmp = TEMPLATE;
//generate dense/sparse bodies
- String tmpDense = _output.codegen(false)
+ String tmpDense = _output.codegen(false, api)
+ getOutputStatement(_output.getVarname());
_output.resetGenerated();
- String tmpSparse = _output.codegen(true)
+ String tmpSparse = _output.codegen(true, api)
+ getOutputStatement(_output.getVarname());
tmp = tmp.replace("%TMP%", createVarname());
tmp = tmp.replace("%BODY_dense%", tmpDense);
@@ -209,4 +210,15 @@ public class CNodeRow extends CNodeTpl
sb.append("]");
return sb.toString();
}
+
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
index 8939db9..5e81109 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTernary.java
@@ -23,7 +23,7 @@ import java.util.Arrays;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeTernary extends CNode
{
@@ -37,43 +37,7 @@ public class CNodeTernary extends CNode
return Arrays.stream(values()).anyMatch(tt -> tt.name().equals(value));
}
- public String getTemplate(boolean sparse) {
- switch (this) {
- case PLUS_MULT:
- return " double %TMP% = %IN1% + %IN2% * %IN3%;\n";
-
- case MINUS_MULT:
- return " double %TMP% = %IN1% - %IN2% * %IN3%;\n";
-
- case BIASADD:
- return " double %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
-
- case BIASMULT:
- return " double %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
-
- case REPLACE:
- return " double %TMP% = (%IN1% == %IN2% || (Double.isNaN(%IN1%) "
- + "&& Double.isNaN(%IN2%))) ? %IN3% : %IN1%;\n";
-
- case REPLACE_NAN:
- return " double %TMP% = Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
-
- case IFELSE:
- return " double %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
-
- case LOOKUP_RC1:
- return sparse ?
- " double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
- " double %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
-
- case LOOKUP_RVECT1:
- return " double[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
-
- default:
- throw new RuntimeException("Invalid ternary type: "+this.toString());
- }
- }
-
+
public boolean isVectorPrimitive() {
return (this == LOOKUP_RVECT1);
}
@@ -94,23 +58,25 @@ public class CNodeTernary extends CNode
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
if( isGenerated() )
return "";
StringBuilder sb = new StringBuilder();
//generate children
- sb.append(_inputs.get(0).codegen(sparse));
- sb.append(_inputs.get(1).codegen(sparse));
- sb.append(_inputs.get(2).codegen(sparse));
+ sb.append(_inputs.get(0).codegen(sparse, api));
+ sb.append(_inputs.get(1).codegen(sparse, api));
+ sb.append(_inputs.get(2).codegen(sparse, api));
//generate binary operation
boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
&& _inputs.get(0).getVarname().startsWith("a")
&& !_inputs.get(0).isLiteral());
String var = createVarname();
- String tmp = _type.getTemplate(lsparse);
+// String tmp = _type.getTemplate(lsparse, api, lang);
+ String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparse);
+
tmp = tmp.replace("%TMP%", var);
for( int j=1; j<=3; j++ ) {
String varj = _inputs.get(j-1).getVarname();
@@ -186,4 +152,14 @@ public class CNodeTernary extends CNode
return super.equals(that)
&& _type == that._type;
}
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
index 82187bb..2026eb3 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
@@ -24,13 +24,15 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
public abstract class CNodeTpl extends CNode implements Cloneable
{
private int _beginLine = -1;
-
+
+ protected GeneratorAPI api = GeneratorAPI.AUTO;
+
public CNodeTpl(ArrayList<CNode> inputs, CNode output ) {
if(inputs.size() < 1)
throw new RuntimeException("Cannot pass empty inputs to the CNodeTpl");
@@ -74,7 +76,7 @@ public abstract class CNodeTpl extends CNode implements Cloneable
}
public String codegen() {
- return codegen(false);
+ return codegen(false, GeneratorAPI.AUTO);
}
@Override
@@ -83,7 +85,7 @@ public abstract class CNodeTpl extends CNode implements Cloneable
public abstract SpoofOutputDimsType getOutputDimType();
public abstract String getTemplateInfo();
-
+
public abstract void renameInputs();
protected void renameInputs(ArrayList<CNode> inputs, int startIndex) {
@@ -232,4 +234,6 @@ public abstract class CNodeTpl extends CNode implements Cloneable
}
return -1;
}
+
+ public GeneratorAPI getGeneratorAPI() { return api; }
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
index ca571ea..2ff9054 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java
@@ -25,7 +25,7 @@ import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.runtime.util.UtilFunctions;
-
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
public class CNodeUnary extends CNode
{
@@ -47,108 +47,7 @@ public class CNodeUnary extends CNode
public static boolean contains(String value) {
return Arrays.stream(values()).anyMatch(ut -> ut.name().equals(value));
}
-
- public String getTemplate(boolean sparse) {
- switch( this ) {
- case ROW_SUMS:
- case ROW_SUMSQS:
- case ROW_MINS:
- case ROW_MAXS:
- case ROW_MEANS:
- case ROW_COUNTNNZS: {
- String vectName = StringUtils.capitalize(name().substring(4, name().length()-1).toLowerCase());
- return sparse ? " double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
- " double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
- }
-
- case VECT_EXP:
- case VECT_POW2:
- case VECT_MULT2:
- case VECT_SQRT:
- case VECT_LOG:
- case VECT_ABS:
- case VECT_ROUND:
- case VECT_CEIL:
- case VECT_FLOOR:
- case VECT_SIGN:
- case VECT_SIN:
- case VECT_COS:
- case VECT_TAN:
- case VECT_ASIN:
- case VECT_ACOS:
- case VECT_ATAN:
- case VECT_SINH:
- case VECT_COSH:
- case VECT_TANH:
- case VECT_CUMSUM:
- case VECT_CUMMIN:
- case VECT_CUMMAX:
- case VECT_SPROP:
- case VECT_SIGMOID: {
- String vectName = getVectorPrimitiveName();
- return sparse ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
- " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
- }
-
- case EXP:
- return " double %TMP% = FastMath.exp(%IN1%);\n";
- case LOOKUP_R:
- return sparse ?
- " double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
- " double %TMP% = getValue(%IN1%, rix);\n";
- case LOOKUP_C:
- return " double %TMP% = getValue(%IN1%, n, 0, cix);\n";
- case LOOKUP_RC:
- return " double %TMP% = getValue(%IN1%, n, rix, cix);\n";
- case LOOKUP0:
- return " double %TMP% = %IN1%[0];\n";
- case POW2:
- return " double %TMP% = %IN1% * %IN1%;\n";
- case MULT2:
- return " double %TMP% = %IN1% + %IN1%;\n";
- case ABS:
- return " double %TMP% = Math.abs(%IN1%);\n";
- case SIN:
- return " double %TMP% = FastMath.sin(%IN1%);\n";
- case COS:
- return " double %TMP% = FastMath.cos(%IN1%);\n";
- case TAN:
- return " double %TMP% = FastMath.tan(%IN1%);\n";
- case ASIN:
- return " double %TMP% = FastMath.asin(%IN1%);\n";
- case ACOS:
- return " double %TMP% = FastMath.acos(%IN1%);\n";
- case ATAN:
- return " double %TMP% = Math.atan(%IN1%);\n";
- case SINH:
- return " double %TMP% = FastMath.sinh(%IN1%);\n";
- case COSH:
- return " double %TMP% = FastMath.cosh(%IN1%);\n";
- case TANH:
- return " double %TMP% = FastMath.tanh(%IN1%);\n";
- case SIGN:
- return " double %TMP% = FastMath.signum(%IN1%);\n";
- case SQRT:
- return " double %TMP% = Math.sqrt(%IN1%);\n";
- case LOG:
- return " double %TMP% = Math.log(%IN1%);\n";
- case ROUND:
- return " double %TMP% = Math.round(%IN1%);\n";
- case CEIL:
- return " double %TMP% = FastMath.ceil(%IN1%);\n";
- case FLOOR:
- return " double %TMP% = FastMath.floor(%IN1%);\n";
- case SPROP:
- return " double %TMP% = %IN1% * (1 - %IN1%);\n";
- case SIGMOID:
- return " double %TMP% = 1 / (1 + FastMath.exp(-%IN1%));\n";
- case LOG_NZ:
- return " double %TMP% = (%IN1%==0) ? 0 : Math.log(%IN1%);\n";
-
- default:
- throw new RuntimeException("Invalid unary type: "+this.toString());
- }
- }
+
public boolean isVectorScalarPrimitive() {
return this == VECT_EXP || this == VECT_POW2
|| this == VECT_MULT2 || this == VECT_SQRT
@@ -196,21 +95,21 @@ public class CNodeUnary extends CNode
}
@Override
- public String codegen(boolean sparse) {
+ public String codegen(boolean sparse, GeneratorAPI api) {
if( isGenerated() )
return "";
StringBuilder sb = new StringBuilder();
//generate children
- sb.append(_inputs.get(0).codegen(sparse));
+ sb.append(_inputs.get(0).codegen(sparse, api));
//generate unary operation
boolean lsparse = sparse && (_inputs.get(0) instanceof CNodeData
&& _inputs.get(0).getVarname().startsWith("a")
&& !_inputs.get(0).isLiteral());
String var = createVarname();
- String tmp = _type.getTemplate(lsparse);
+ String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparse);
tmp = tmp.replace("%TMP%", var);
//replace sparse and dense inputs
@@ -361,4 +260,14 @@ public class CNodeUnary extends CNode
return super.equals(that)
&& _type == that._type;
}
+ @Override
+ public boolean isSupported(GeneratorAPI api) {
+ boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA);
+ int i = 0;
+ while(is_supported && i < _inputs.size()) {
+ CNode in = _inputs.get(i++);
+ is_supported = in.isSupported(api);
+ }
+ return is_supported;
+ }
}
diff --git a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
similarity index 60%
copy from src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
copy to src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
index 1becb67..8a8a3be 100644
--- a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java
@@ -1,4 +1,4 @@
-<!--
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -6,25 +6,31 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
--->
-
-<root>
- <sysds.localtmpdir>/tmp/systemds</sysds.localtmpdir>
- <sysds.scratch>scratch_space</sysds.scratch>
- <sysds.optlevel>7</sysds.optlevel>
- <sysds.codegen.enabled>true</sysds.codegen.enabled>
- <sysds.codegen.plancache>true</sysds.codegen.plancache>
- <sysds.codegen.literals>1</sysds.codegen.literals>
-
- <!-- The number of theads for the spark instance artificially selected-->
- <sysds.local.spark.number.threads>16</sysds.local.spark.number.threads>
-</root>
\ No newline at end of file
+ */
+
+package org.apache.sysds.hops.codegen.cplan;
+
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public interface CodeTemplate {
+
+ String getTemplate();
+
+ String getTemplate(CNodeUnary.UnaryType type, boolean sparse);
+
+ String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+ boolean scalarInput);
+
+ String getTemplate(CNodeTernary.TernaryType type, boolean sparse);
+
+ String getTemplate(SpoofCellwise.CellType ct);
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java
new file mode 100644
index 0000000..8d78b7b
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Binary.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Binary implements CodeTemplate {
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+ boolean scalarInput) {
+
+ if(isSinglePrecision()) {
+ switch(type) {
+ case DOT_PRODUCT:
+ return sparseLhs ? " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_MATRIXMULT:
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_OUTERMULT_ADD:
+ return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+ //vector-scalar-add operations
+ case VECT_MULT_ADD:
+ case VECT_DIV_ADD:
+ case VECT_MINUS_ADD:
+ case VECT_PLUS_ADD:
+ case VECT_POW_ADD:
+ case VECT_XOR_ADD:
+ case VECT_MIN_ADD:
+ case VECT_MAX_ADD:
+ case VECT_EQUAL_ADD:
+ case VECT_NOTEQUAL_ADD:
+ case VECT_LESS_ADD:
+ case VECT_LESSEQUAL_ADD:
+ case VECT_GREATER_ADD:
+ case VECT_GREATEREQUAL_ADD:
+ case VECT_CBIND_ADD: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+ else
+ return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+ }
+
+ //vector-scalar operations
+ case VECT_MULT_SCALAR:
+ case VECT_DIV_SCALAR:
+ case VECT_MINUS_SCALAR:
+ case VECT_PLUS_SCALAR:
+ case VECT_POW_SCALAR:
+ case VECT_XOR_SCALAR:
+ case VECT_BITWAND_SCALAR:
+ case VECT_MIN_SCALAR:
+ case VECT_MAX_SCALAR:
+ case VECT_EQUAL_SCALAR:
+ case VECT_NOTEQUAL_SCALAR:
+ case VECT_LESS_SCALAR:
+ case VECT_LESSEQUAL_SCALAR:
+ case VECT_GREATER_SCALAR:
+ case VECT_GREATEREQUAL_SCALAR: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+ else
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_CBIND:
+ if(scalarInput)
+ return " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+ else
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+ //vector-vector operations
+ case VECT_MULT:
+ case VECT_DIV:
+ case VECT_MINUS:
+ case VECT_PLUS:
+ case VECT_XOR:
+ case VECT_BITWAND:
+ case VECT_BIASADD:
+ case VECT_BIASMULT:
+ case VECT_MIN:
+ case VECT_MAX:
+ case VECT_EQUAL:
+ case VECT_NOTEQUAL:
+ case VECT_LESS:
+ case VECT_LESSEQUAL:
+ case VECT_GREATER:
+ case VECT_GREATEREQUAL: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ }
+
+ //scalar-scalar operations
+ case MULT:
+ return " T %TMP% = %IN1% * %IN2%;\n";
+ case DIV:
+ return " T %TMP% = %IN1% / %IN2%;\n";
+ case PLUS:
+ return " T %TMP% = %IN1% + %IN2%;\n";
+ case MINUS:
+ return " T %TMP% = %IN1% - %IN2%;\n";
+ case MODULUS:
+ return " T %TMP% = modulus(%IN1%, %IN2%);\n";
+ case INTDIV:
+ return " T %TMP% = intDiv(%IN1%, %IN2%);\n";
+ case LESS:
+ return " T %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
+ case LESSEQUAL:
+ return " T %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
+ case GREATER:
+ return " T %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
+ case GREATEREQUAL:
+ return " T %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
+ case EQUAL:
+ return " T %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
+ case NOTEQUAL:
+ return " T %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
+
+ case MIN:
+ return " T %TMP% = fminf(%IN1%, %IN2%);\n";
+ case MAX:
+ return " T %TMP% = fmaxf(%IN1%, %IN2%);\n";
+ case LOG:
+ return " T %TMP% = logf(%IN1%)/logf(%IN2%);\n";
+ case LOG_NZ:
+ return " T %TMP% = (%IN1% == 0) ? 0 : logf(%IN1%) / logf(%IN2%);\n";
+ case POW:
+ return " T %TMP% = powf(%IN1%, %IN2%);\n";
+ case MINUS1_MULT:
+ return " T %TMP% = 1 - %IN1% * %IN2%;\n";
+ case MINUS_NZ:
+ return " T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+ case XOR:
+ return " T %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1.0f : 0.0f;\n";
+ case BITWAND:
+ return " T %TMP% = bwAnd(%IN1%, %IN2%);\n";
+ case SEQ_RIX:
+ return " T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+ default:
+ throw new RuntimeException("Invalid binary type: " + type.toString());
+ }
+ }
+ else {
+ switch(type) {
+ case DOT_PRODUCT:
+ return sparseLhs ? " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_MATRIXMULT:
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_OUTERMULT_ADD:
+ return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+ //vector-scalar-add operations
+ case VECT_MULT_ADD:
+ case VECT_DIV_ADD:
+ case VECT_MINUS_ADD:
+ case VECT_PLUS_ADD:
+ case VECT_POW_ADD:
+ case VECT_XOR_ADD:
+ case VECT_MIN_ADD:
+ case VECT_MAX_ADD:
+ case VECT_EQUAL_ADD:
+ case VECT_NOTEQUAL_ADD:
+ case VECT_LESS_ADD:
+ case VECT_LESSEQUAL_ADD:
+ case VECT_GREATER_ADD:
+ case VECT_GREATEREQUAL_ADD:
+ case VECT_CBIND_ADD: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+ else
+ return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+ }
+
+ //vector-scalar operations
+ case VECT_MULT_SCALAR:
+ case VECT_DIV_SCALAR:
+ case VECT_MINUS_SCALAR:
+ case VECT_PLUS_SCALAR:
+ case VECT_POW_SCALAR:
+ case VECT_XOR_SCALAR:
+ case VECT_BITWAND_SCALAR:
+ case VECT_MIN_SCALAR:
+ case VECT_MAX_SCALAR:
+ case VECT_EQUAL_SCALAR:
+ case VECT_NOTEQUAL_SCALAR:
+ case VECT_LESS_SCALAR:
+ case VECT_LESSEQUAL_SCALAR:
+ case VECT_GREATER_SCALAR:
+ case VECT_GREATEREQUAL_SCALAR: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+ else
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_CBIND:
+ if(scalarInput)
+ return " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+ else
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+ //vector-vector operations
+ case VECT_MULT:
+ case VECT_DIV:
+ case VECT_MINUS:
+ case VECT_PLUS:
+ case VECT_XOR:
+ case VECT_BITWAND:
+ case VECT_BIASADD:
+ case VECT_BIASMULT:
+ case VECT_MIN:
+ case VECT_MAX:
+ case VECT_EQUAL:
+ case VECT_NOTEQUAL:
+ case VECT_LESS:
+ case VECT_LESSEQUAL:
+ case VECT_GREATER:
+ case VECT_GREATEREQUAL: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ }
+
+ //scalar-scalar operations
+ case MULT:
+ return " T %TMP% = %IN1% * %IN2%;\n";
+ case DIV:
+ return " T %TMP% = %IN1% / %IN2%;\n";
+ case PLUS:
+ return " T %TMP% = %IN1% + %IN2%;\n";
+ case MINUS:
+ return " T %TMP% = %IN1% - %IN2%;\n";
+ case MODULUS:
+ return " T %TMP% = modulus(%IN1%, %IN2%);\n";
+ case INTDIV:
+ return " T %TMP% = intDiv(%IN1%, %IN2%);\n";
+ case LESS:
+ return " T %TMP% = (%IN1% < %IN2%) ? 1.0 : 0.0;\n";
+ case LESSEQUAL:
+ return " T %TMP% = (%IN1% <= %IN2%) ? 1.0 : 0.0;\n";
+ case GREATER:
+ return " T %TMP% = (%IN1% > (%IN2% + EPSILON)) ? 1.0 : 0.0;\n";
+ case GREATEREQUAL:
+ return " T %TMP% = (%IN1% >= %IN2%) ? 1.0 : 0.0;\n";
+ case EQUAL:
+ return " T %TMP% = (%IN1% == %IN2%) ? 1.0 : 0.0;\n";
+ case NOTEQUAL:
+ return " T %TMP% = (%IN1% != %IN2%) ? 1.0 : 0.0;\n";
+
+ case MIN:
+ return " T %TMP% = min(%IN1%, %IN2%);\n";
+ case MAX:
+ return " T %TMP% = max(%IN1%, %IN2%);\n";
+ case LOG:
+ return " T %TMP% = log(%IN1%)/log(%IN2%);\n";
+ case LOG_NZ:
+ return " T %TMP% = (%IN1% == 0) ? 0 : log(%IN1%) / log(%IN2%);\n";
+ case POW:
+ return " T %TMP% = pow(%IN1%, %IN2%);\n";
+ case MINUS1_MULT:
+ return " T %TMP% = 1 - %IN1% * %IN2%;\n";
+ case MINUS_NZ:
+ return " T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+ case XOR:
+// return " T %TMP% = ( (%IN1% != 0.0) != (%IN2% != 0.0) ) ? 1.0 : 0.0;\n";
+ return " T %TMP% = ( (%IN1% < EPSILON) != (%IN2% < EPSILON) ) ? 1.0 : 0.0;\n";
+ case BITWAND:
+ return " T %TMP% = bwAnd(%IN1%, %IN2%);\n";
+ case SEQ_RIX:
+ return " T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+ default:
+ throw new RuntimeException("Invalid binary type: " + type.toString());
+ }
+ }
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java
new file mode 100644
index 0000000..f76f3ec
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/CellWise.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.conf.DMLConfig;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+import org.apache.sysds.runtime.io.IOUtilFunctions;
+
+import java.io.*;
+import java.util.stream.Collectors;
+
+// ToDo: clean code template and load from file
+public class CellWise implements CodeTemplate {
+
+ private static final String TEMPLATE_PATH = "/cuda/spoof/cellwise.cu";
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ try {
+            // Change prefix to the code template file if running from jar. Files were extracted to a temporary
+ // directory in that case. By default we load the template from the source tree.
+ if(CellWise.class.getProtectionDomain().getCodeSource().getLocation().getPath().contains(".jar"))
+ return(IOUtilFunctions.toString(new FileInputStream(ConfigurationManager.getDMLConfig()
+ .getTextValue(DMLConfig.LOCAL_TMP_DIR) + TEMPLATE_PATH)));
+ else
+ return IOUtilFunctions.toString(new FileInputStream(System.getProperty("user.dir") +
+ "/src/main" + TEMPLATE_PATH));
+ }
+ catch(IOException e) {
+            throw new RuntimeException("Failed to read SPOOF CUDA code template "
+                + TEMPLATE_PATH, e);
+ }
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java
new file mode 100644
index 0000000..3edfcea
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Ternary.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Ternary implements CodeTemplate {
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ if(isSinglePrecision()) {
+ switch (type) {
+ case PLUS_MULT:
+ return " T %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+ case MINUS_MULT:
+ return " T %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+ case BIASADD:
+ return " T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+ case BIASMULT:
+ return " T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+ case REPLACE:
+ return " T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
+ + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
+
+ case REPLACE_NAN:
+ return " T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
+
+ case IFELSE:
+ return " T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+ case LOOKUP_RC1:
+ return sparse ?
+ " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+ " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ case LOOKUP_RVECT1:
+ return " T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ default:
+                    throw new RuntimeException("Invalid ternary type: " + type.toString());
+ }
+ }
+ else {
+ switch (type) {
+ case PLUS_MULT:
+ return " T %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+ case MINUS_MULT:
+ return " T %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+ case BIASADD:
+ return " T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+ case BIASMULT:
+ return " T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+ case REPLACE:
+ return " T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
+ + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
+
+ case REPLACE_NAN:
+ return " T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
+
+ case IFELSE:
+ return " T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+ case LOOKUP_RC1:
+ return sparse ?
+ " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+ " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ case LOOKUP_RVECT1:
+ return " T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ default:
+                    throw new RuntimeException("Invalid ternary type: " + type.toString());
+ }
+
+ }
+ }
+
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+ boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java
new file mode 100644
index 0000000..ed18779
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cpp/Unary.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.cpp;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+public class Unary implements CodeTemplate {
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ if(isSinglePrecision()) {
+ switch( type ) {
+ case ROW_SUMS:
+ case ROW_SUMSQS:
+ case ROW_MINS:
+ case ROW_MAXS:
+ case ROW_MEANS:
+ case ROW_COUNTNNZS: {
+ String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+ return sparse ? " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+ " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_EXP:
+ case VECT_POW2:
+ case VECT_MULT2:
+ case VECT_SQRT:
+ case VECT_LOG:
+ case VECT_ABS:
+ case VECT_ROUND:
+ case VECT_CEIL:
+ case VECT_FLOOR:
+ case VECT_SIGN:
+ case VECT_SIN:
+ case VECT_COS:
+ case VECT_TAN:
+ case VECT_ASIN:
+ case VECT_ACOS:
+ case VECT_ATAN:
+ case VECT_SINH:
+ case VECT_COSH:
+ case VECT_TANH:
+ case VECT_CUMSUM:
+ case VECT_CUMMIN:
+ case VECT_CUMMAX:
+ case VECT_SPROP:
+ case VECT_SIGMOID: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparse ? " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+ " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case EXP:
+ return " T %TMP% = expf(%IN1%);\n";
+ case LOOKUP_R:
+ return sparse ?
+ " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+ " T %TMP% = getValue(%IN1%, rix);\n";
+ case LOOKUP_C:
+ return " T %TMP% = getValue(%IN1%, n, 0, cix);\n";
+ case LOOKUP_RC:
+ return " T %TMP% = getValue(%IN1%, n, rix, cix);\n";
+ case LOOKUP0:
+ return " T %TMP% = %IN1%[0];\n";
+ case POW2:
+ return " T %TMP% = %IN1% * %IN1%;\n";
+ case MULT2:
+ return " T %TMP% = %IN1% + %IN1%;\n";
+ case ABS:
+ return " T %TMP% = fabsf(%IN1%);\n";
+ case SIN:
+ return " T %TMP% = sinf(%IN1%);\n";
+ case COS:
+ return " T %TMP% = cosf(%IN1%);\n";
+ case TAN:
+ return " T %TMP% = tanf(%IN1%);\n";
+ case ASIN:
+ return " T %TMP% = asinf(%IN1%);\n";
+ case ACOS:
+ return " T %TMP% = acosf(%IN1%);\n";
+ case ATAN:
+ return " T %TMP% = atanf(%IN1%);\n";
+ case SINH:
+ return " T %TMP% = sinhf(%IN1%);\n";
+ case COSH:
+ return " T %TMP% = coshf(%IN1%);\n";
+ case TANH:
+ return " T %TMP% = tanhf(%IN1%);\n";
+ case SIGN:
+                    return "    T %TMP% = (%IN1% == 0.0f) ? 0.0f : (signbit(%IN1%) == 0 ? 1.0f : -1.0f);\n";
+ case SQRT:
+ return " T %TMP% = sqrtf(%IN1%);\n";
+ case LOG:
+ return " T %TMP% = logf(%IN1%);\n";
+ case ROUND:
+ return " T %TMP% = roundf(%IN1%);\n";
+ case CEIL:
+ return " T %TMP% = ceilf(%IN1%);\n";
+ case FLOOR:
+ return " T %TMP% = floorf(%IN1%);\n";
+ case SPROP:
+ return " T %TMP% = %IN1% * (1 - %IN1%);\n";
+ case SIGMOID:
+ return " T %TMP% = 1 / (1 + expf(-%IN1%));\n";
+ case LOG_NZ:
+ return " T %TMP% = (%IN1%==0) ? 0 : logf(%IN1%);\n";
+
+ default:
+                    throw new RuntimeException("Invalid unary type: " + type.toString());
+ }
+ }
+ else { /* double precision */
+ switch( type ) {
+ case ROW_SUMS:
+ case ROW_SUMSQS:
+ case ROW_MINS:
+ case ROW_MAXS:
+ case ROW_MEANS:
+ case ROW_COUNTNNZS: {
+ String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+ return sparse ? " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+ " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_EXP:
+ case VECT_POW2:
+ case VECT_MULT2:
+ case VECT_SQRT:
+ case VECT_LOG:
+ case VECT_ABS:
+ case VECT_ROUND:
+ case VECT_CEIL:
+ case VECT_FLOOR:
+ case VECT_SIGN:
+ case VECT_SIN:
+ case VECT_COS:
+ case VECT_TAN:
+ case VECT_ASIN:
+ case VECT_ACOS:
+ case VECT_ATAN:
+ case VECT_SINH:
+ case VECT_COSH:
+ case VECT_TANH:
+ case VECT_CUMSUM:
+ case VECT_CUMMIN:
+ case VECT_CUMMAX:
+ case VECT_SPROP:
+ case VECT_SIGMOID: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparse ? " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+ " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case EXP:
+ return " T %TMP% = exp(%IN1%);\n";
+ case LOOKUP_R:
+ return sparse ?
+ " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+ " T %TMP% = getValue(%IN1%, rix);\n";
+ case LOOKUP_C:
+ return " T %TMP% = getValue(%IN1%, n, 0, cix);\n";
+ case LOOKUP_RC:
+ return " T %TMP% = getValue(%IN1%, n, rix, cix);\n";
+ case LOOKUP0:
+ return " T %TMP% = %IN1%[0];\n";
+ case POW2:
+ return " T %TMP% = %IN1% * %IN1%;\n";
+ case MULT2:
+ return " T %TMP% = %IN1% + %IN1%;\n";
+ case ABS:
+ return " T %TMP% = fabs(%IN1%);\n";
+ case SIN:
+ return " T %TMP% = sin(%IN1%);\n";
+ case COS:
+ return " T %TMP% = cos(%IN1%);\n";
+ case TAN:
+ return " T %TMP% = tan(%IN1%);\n";
+ case ASIN:
+ return " T %TMP% = asin(%IN1%);\n";
+ case ACOS:
+ return " T %TMP% = acos(%IN1%);\n";
+ case ATAN:
+ return " T %TMP% = atan(%IN1%);\n";
+ case SINH:
+ return " T %TMP% = sinh(%IN1%);\n";
+ case COSH:
+ return " T %TMP% = cosh(%IN1%);\n";
+ case TANH:
+ return " T %TMP% = tanh(%IN1%);\n";
+ case SIGN:
+                    return "    T %TMP% = (%IN1% == 0.0) ? 0.0 : (signbit(%IN1%) == 0 ? 1.0 : -1.0);\n";
+ case SQRT:
+ return " T %TMP% = sqrt(%IN1%);\n";
+ case LOG:
+ return " T %TMP% = log(%IN1%);\n";
+ case ROUND:
+ return " T %TMP% = round(%IN1%);\n";
+ case CEIL:
+ return " T %TMP% = ceil(%IN1%);\n";
+ case FLOOR:
+ return " T %TMP% = floor(%IN1%);\n";
+ case SPROP:
+ return " T %TMP% = %IN1% * (1 - %IN1%);\n";
+ case SIGMOID:
+ return " T %TMP% = 1 / (1 + exp(-%IN1%));\n";
+ case LOG_NZ:
+ return " T %TMP% = (%IN1%==0) ? 0 : log(%IN1%);\n";
+
+ default:
+                    throw new RuntimeException("Invalid unary type: " + type.toString());
+ }
+
+ }
+ }
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java
new file mode 100644
index 0000000..39b0f6f
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class Binary implements CodeTemplate {
+ @Override
+ public String getTemplate(BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+ boolean scalarInput) {
+
+ switch (type) {
+ case DOT_PRODUCT:
+ return sparseLhs ? " double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
+ " double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_MATRIXMULT:
+ return sparseLhs ? " double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case VECT_OUTERMULT_ADD:
+ return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+ sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+ " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
+
+ //vector-scalar-add operations
+ case VECT_MULT_ADD:
+ case VECT_DIV_ADD:
+ case VECT_MINUS_ADD:
+ case VECT_PLUS_ADD:
+ case VECT_POW_ADD:
+ case VECT_XOR_ADD:
+ case VECT_MIN_ADD:
+ case VECT_MAX_ADD:
+ case VECT_EQUAL_ADD:
+ case VECT_NOTEQUAL_ADD:
+ case VECT_LESS_ADD:
+ case VECT_LESSEQUAL_ADD:
+ case VECT_GREATER_ADD:
+ case VECT_GREATEREQUAL_ADD:
+ case VECT_CBIND_ADD: {
+ String vectName = type.getVectorPrimitiveName();
+ if( scalarVector )
+ return sparseLhs ? " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" :
+ " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+ else
+ return sparseLhs ? " LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" :
+ " LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+ }
+
+ //vector-scalar operations
+ case VECT_MULT_SCALAR:
+ case VECT_DIV_SCALAR:
+ case VECT_MINUS_SCALAR:
+ case VECT_PLUS_SCALAR:
+ case VECT_POW_SCALAR:
+ case VECT_XOR_SCALAR:
+ case VECT_BITWAND_SCALAR:
+ case VECT_MIN_SCALAR:
+ case VECT_MAX_SCALAR:
+ case VECT_EQUAL_SCALAR:
+ case VECT_NOTEQUAL_SCALAR:
+ case VECT_LESS_SCALAR:
+ case VECT_LESSEQUAL_SCALAR:
+ case VECT_GREATER_SCALAR:
+ case VECT_GREATEREQUAL_SCALAR: {
+ String vectName = type.getVectorPrimitiveName();
+ if( scalarVector )
+ return sparseRhs ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
+ else
+ return sparseLhs ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_CBIND:
+ if( scalarInput )
+ return " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%);\n";
+ else
+ return sparseLhs ?
+ " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vectCbindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
+
+ //vector-vector operations
+ case VECT_MULT:
+ case VECT_DIV:
+ case VECT_MINUS:
+ case VECT_PLUS:
+ case VECT_XOR:
+ case VECT_BITWAND:
+ case VECT_BIASADD:
+ case VECT_BIASMULT:
+ case VECT_MIN:
+ case VECT_MAX:
+ case VECT_EQUAL:
+ case VECT_NOTEQUAL:
+ case VECT_LESS:
+ case VECT_LESSEQUAL:
+ case VECT_GREATER:
+ case VECT_GREATEREQUAL: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparseLhs ?
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" :
+ sparseRhs ?
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ }
+
+ //scalar-scalar operations
+ case MULT:
+ return " double %TMP% = %IN1% * %IN2%;\n";
+
+ case DIV:
+ return " double %TMP% = %IN1% / %IN2%;\n";
+ case PLUS:
+ return " double %TMP% = %IN1% + %IN2%;\n";
+ case MINUS:
+ return " double %TMP% = %IN1% - %IN2%;\n";
+ case MODULUS:
+ return " double %TMP% = LibSpoofPrimitives.mod(%IN1%, %IN2%);\n";
+ case INTDIV:
+ return " double %TMP% = LibSpoofPrimitives.intDiv(%IN1%, %IN2%);\n";
+ case LESS:
+ return " double %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
+ case LESSEQUAL:
+ return " double %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
+ case GREATER:
+ return " double %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
+ case GREATEREQUAL:
+ return " double %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
+ case EQUAL:
+ return " double %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
+ case NOTEQUAL:
+ return " double %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
+
+ case MIN:
+ return " double %TMP% = Math.min(%IN1%, %IN2%);\n";
+ case MAX:
+ return " double %TMP% = Math.max(%IN1%, %IN2%);\n";
+ case LOG:
+ return " double %TMP% = Math.log(%IN1%)/Math.log(%IN2%);\n";
+ case LOG_NZ:
+ return " double %TMP% = (%IN1% == 0) ? 0 : Math.log(%IN1%)/Math.log(%IN2%);\n";
+ case POW:
+ return " double %TMP% = Math.pow(%IN1%, %IN2%);\n";
+ case MINUS1_MULT:
+ return " double %TMP% = 1 - %IN1% * %IN2%;\n";
+ case MINUS_NZ:
+ return " double %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+ case XOR:
+ return " double %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1 : 0;\n";
+ case BITWAND:
+ return " double %TMP% = LibSpoofPrimitives.bwAnd(%IN1%, %IN2%);\n";
+ case SEQ_RIX:
+ return " double %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+
+ default:
+                throw new RuntimeException("Invalid binary type: " + type.toString());
+ }
+ }
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java
new file mode 100644
index 0000000..319c872
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class CellWise implements CodeTemplate {
+ public static final String TEMPLATE =
+ "package codegen;\n"
+ + "import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;\n"
+ + "import org.apache.sysds.runtime.codegen.SpoofCellwise;\n"
+ + "import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;\n"
+ + "import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;\n"
+ + "import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;\n"
+ + "import org.apache.commons.math3.util.FastMath;\n"
+ + "\n"
+ + "public final class %TMP% extends SpoofCellwise {\n"
+ + " public %TMP%() {\n"
+ + " super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP_NAME%);\n"
+ + " }\n"
+ + " protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) { \n"
+ + "%BODY_dense%"
+ + " return %OUT%;\n"
+ + " }\n"
+ + "}\n";
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ switch(ct) {
+ case NO_AGG:
+ case FULL_AGG:
+ case ROW_AGG:
+ case COL_AGG:
+ default:
+ return TEMPLATE;
+ }
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
new file mode 100644
index 0000000..af48d05
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+public class Ternary implements CodeTemplate {
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ switch (type) {
+ case PLUS_MULT:
+ return " double %TMP% = %IN1% + %IN2% * %IN3%;\n";
+
+ case MINUS_MULT:
+ return " double %TMP% = %IN1% - %IN2% * %IN3%;\n";
+
+ case BIASADD:
+ return " double %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+
+ case BIASMULT:
+ return " double %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+
+ case REPLACE:
+ return " double %TMP% = (%IN1% == %IN2% || (Double.isNaN(%IN1%) "
+ + "&& Double.isNaN(%IN2%))) ? %IN3% : %IN1%;\n";
+
+ case REPLACE_NAN:
+ return " double %TMP% = Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
+
+ case IFELSE:
+ return " double %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+
+ case LOOKUP_RC1:
+ return sparse ?
+ " double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+ " double %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ case LOOKUP_RVECT1:
+ return " double[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
+
+ default:
+                throw new RuntimeException("Invalid ternary type: " + type.toString());
+ }
+ }
+
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
+ boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
new file mode 100644
index 0000000..7071e08
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.codegen.cplan.java;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysds.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+
+// Java (janino) source-code templates for SPOOF unary operations.
+// getTemplate(UnaryType, boolean) returns a Java snippet with placeholders
+// (%TMP%, %IN1%, %IN1v%, %IN1i%, %POS1%, %LEN%) substituted by the codegen
+// framework; all other getTemplate overloads are invalid for this class.
+public class Unary implements CodeTemplate {
+ @Override
+ public String getTemplate(UnaryType type, boolean sparse) {
+ switch( type ) {
+ case ROW_SUMS:
+ case ROW_SUMSQS:
+ case ROW_MINS:
+ case ROW_MAXS:
+ case ROW_MEANS:
+ case ROW_COUNTNNZS: {
+ // derive the primitive name from the enum, e.g. ROW_SUMS -> "Sum" -> vectSum
+ String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+ return sparse ? " double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
+ " double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case VECT_EXP:
+ case VECT_POW2:
+ case VECT_MULT2:
+ case VECT_SQRT:
+ case VECT_LOG:
+ case VECT_ABS:
+ case VECT_ROUND:
+ case VECT_CEIL:
+ case VECT_FLOOR:
+ case VECT_SIGN:
+ case VECT_SIN:
+ case VECT_COS:
+ case VECT_TAN:
+ case VECT_ASIN:
+ case VECT_ACOS:
+ case VECT_ATAN:
+ case VECT_SINH:
+ case VECT_COSH:
+ case VECT_TANH:
+ case VECT_CUMSUM:
+ case VECT_CUMMIN:
+ case VECT_CUMMAX:
+ case VECT_SPROP:
+ case VECT_SIGMOID: {
+ // vector ops allocate a fresh output via the "...Write" primitive variants
+ String vectName = type.getVectorPrimitiveName();
+ return sparse ? " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
+ " double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+ }
+
+ case EXP:
+ return " double %TMP% = FastMath.exp(%IN1%);\n";
+ case LOOKUP_R:
+ return sparse ?
+ " double %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+ " double %TMP% = getValue(%IN1%, rix);\n";
+ case LOOKUP_C:
+ return " double %TMP% = getValue(%IN1%, n, 0, cix);\n";
+ case LOOKUP_RC:
+ return " double %TMP% = getValue(%IN1%, n, rix, cix);\n";
+ case LOOKUP0:
+ return " double %TMP% = %IN1%[0];\n";
+ case POW2:
+ return " double %TMP% = %IN1% * %IN1%;\n";
+ case MULT2:
+ // x + x == 2*x, avoids a multiplication
+ return " double %TMP% = %IN1% + %IN1%;\n";
+ case ABS:
+ return " double %TMP% = Math.abs(%IN1%);\n";
+ case SIN:
+ return " double %TMP% = FastMath.sin(%IN1%);\n";
+ case COS:
+ return " double %TMP% = FastMath.cos(%IN1%);\n";
+ case TAN:
+ return " double %TMP% = FastMath.tan(%IN1%);\n";
+ case ASIN:
+ return " double %TMP% = FastMath.asin(%IN1%);\n";
+ case ACOS:
+ return " double %TMP% = FastMath.acos(%IN1%);\n";
+ case ATAN:
+ return " double %TMP% = Math.atan(%IN1%);\n";
+ case SINH:
+ return " double %TMP% = FastMath.sinh(%IN1%);\n";
+ case COSH:
+ return " double %TMP% = FastMath.cosh(%IN1%);\n";
+ case TANH:
+ return " double %TMP% = FastMath.tanh(%IN1%);\n";
+ case SIGN:
+ return " double %TMP% = FastMath.signum(%IN1%);\n";
+ case SQRT:
+ return " double %TMP% = Math.sqrt(%IN1%);\n";
+ case LOG:
+ return " double %TMP% = Math.log(%IN1%);\n";
+ case ROUND:
+ return " double %TMP% = Math.round(%IN1%);\n";
+ case CEIL:
+ return " double %TMP% = FastMath.ceil(%IN1%);\n";
+ case FLOOR:
+ return " double %TMP% = FastMath.floor(%IN1%);\n";
+ case SPROP:
+ return " double %TMP% = %IN1% * (1 - %IN1%);\n";
+ case SIGMOID:
+ return " double %TMP% = 1 / (1 + FastMath.exp(-%IN1%));\n";
+ case LOG_NZ:
+ return " double %TMP% = (%IN1%==0) ? 0 : Math.log(%IN1%);\n";
+
+ default:
+ // report the offending enum value (this.toString() would only print an
+ // unhelpful object identity)
+ throw new RuntimeException("Invalid unary type: "+type.toString());
+ }
+ }
+
+ // The remaining overloads are not applicable to unary codegen and fail fast.
+ @Override
+ public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate() {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(SpoofCellwise.CellType ct) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+
+ @Override
+ public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+ throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+ }
+}
diff --git a/src/main/java/org/apache/sysds/lops/SpoofFused.java b/src/main/java/org/apache/sysds/lops/SpoofFused.java
index bd605f8..0795f8f 100644
--- a/src/main/java/org/apache/sysds/lops/SpoofFused.java
+++ b/src/main/java/org/apache/sysds/lops/SpoofFused.java
@@ -21,7 +21,8 @@ package org.apache.sysds.lops;
import java.util.ArrayList;
-
+
+import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
import org.apache.sysds.lops.LopProperties.ExecType;
import org.apache.sysds.common.Types.DataType;
@@ -31,12 +32,17 @@ public class SpoofFused extends Lop
{
private final Class<?> _class;
private final int _numThreads;
-
- public SpoofFused( ArrayList<Lop> inputs, DataType dt, ValueType vt, Class<?> cla, int k, ExecType etype) {
+ private final String _genVarName;
+
+ private GeneratorAPI _api;
+ public SpoofFused(ArrayList<Lop> inputs, DataType dt, ValueType vt, Class<?> cla, GeneratorAPI api,
+ String genVarName, int k, ExecType etype) {
super(Type.SpoofFused, dt, vt);
_class = cla;
_numThreads = k;
-
+ _api = api;
+ _genVarName = genVarName;
+
for( Lop lop : inputs ) {
addInput(lop);
lop.addOutput(this);
@@ -47,7 +53,11 @@ public class SpoofFused extends Lop
@Override
public String toString() {
- return "spoof("+_class.getSimpleName()+")";
+ if(_class != null)
+ return "spoof("+_class.getSimpleName()+")";
+ else
+ return "spoof(" + _genVarName + ")";
+
}
@Override
@@ -98,8 +108,14 @@ public class SpoofFused extends Lop
sb.append( "spoof" );
sb.append( OPERAND_DELIMITOR );
- sb.append( _class.getName() );
-
+ sb.append( _api);
+ sb.append( OPERAND_DELIMITOR );
+ if(_class != null)
+ sb.append( _class.getName() );
+ else
+ sb.append("codegen." + _genVarName);
+
+
for(int i=0; i < inputs.length; i++) {
sb.append( OPERAND_DELIMITOR );
sb.append( getInputs().get(i).prepInputOperand(inputs[i]));
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java b/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
index 64ef9f8..d8d3d2b 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/CodegenUtils.java
@@ -63,7 +63,9 @@ public class CodegenUtils
//janino-specific map of source code transfer/recompile on-demand
private static ConcurrentHashMap<String, String> _src = new ConcurrentHashMap<>();
-
+
+ private static ConcurrentHashMap<String, SpoofCUDA> _native_op_data = new ConcurrentHashMap<>();
+
//javac-specific working directory for src/class files
private static String _workingDir = null;
@@ -156,7 +158,15 @@ public class CodegenUtils
return ret;
}
-
+
+ public static SpoofCUDA getNativeOpData(String name) {
+ return _native_op_data.get(name);
+ }
+
+ public static void putNativeOpData(SpoofCUDA op) {
+ _native_op_data.put(op.getName(), op);
+ }
+
public static SideInput createSideInput(MatrixBlock in) {
SideInput ret = (in.isInSparseFormat() || !in.isAllocated()) ?
new SideInput(null, in, in.getNumColumns()) :
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
new file mode 100644
index 0000000..ac783c6
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.codegen;
+
+import java.util.ArrayList;
+
+import org.apache.sysds.hops.codegen.SpoofCompiler;
+import org.apache.sysds.hops.codegen.cplan.CNodeCell;
+import org.apache.sysds.hops.codegen.cplan.CNodeMultiAgg;
+import org.apache.sysds.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysds.hops.codegen.cplan.CNodeRow;
+import org.apache.sysds.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
+
+// SPOOF operator that executes a fused kernel generated by the CUDA code
+// generator via JNI, instead of running generated Java code on the CPU.
+public class SpoofCUDA extends SpoofOperator {
+
+ // compiled cplan template this operator was generated from
+ private final CNodeTpl cnt;
+ // qualified operator name, e.g. "codegen.TMP123"
+ public final String name;
+
+ public SpoofCUDA(CNodeTpl cnode) {
+ name = "codegen." + cnode.getVarname();
+ cnt = cnode;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public CNodeTpl getCNodeTemplate() {
+ return cnt;
+ }
+
+ // unqualified operator name passed to the native side (strips "codegen." prefix)
+ private String getOperatorName() {
+ return name.split("\\.")[1];
+ }
+
+ // Short tag identifying the spoof template type:
+ // CW=cellwise, RA=row-aggregate, MA=multi-aggregate, OP=outer-product.
+ public String getSpoofTemplateType() {
+ if (cnt instanceof CNodeCell)
+ return "CW";
+ else if(cnt instanceof CNodeRow)
+ return "RA";
+ else if(cnt instanceof CNodeMultiAgg)
+ return "MA";
+ else if(cnt instanceof CNodeOuterProduct)
+ return "OP";
+ else
+ throw new RuntimeException("unknown spoof operator type");
+ }
+
+ @Override
+ public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) {
+ // CPU-side execution does not apply; this operator runs on the GPU only
+ throw new RuntimeException("method not implemented for SpoofCUDA");
+ }
+
+ // Executes the generated CUDA kernel on GPU-resident data; returns the native
+ // call's result (scalar output for aggregations). Matrices are passed as raw
+ // GPU pointer addresses; only dense inputs are prepared for now.
+ public double execute(ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj,
+ ExecutionContext ec) {
+ double ret = 0;
+ long out_ptr = 0;
+
+ if(out_obj != null)
+ out_ptr = ec.getGPUPointerAddress(out_obj);
+
+ // outer-product templates consume two main inputs; all others one
+ int offset = 1;
+ if(cnt instanceof CNodeOuterProduct)
+ offset = 2;
+
+ // only dense input preparation for now
+ long[] in_ptrs = new long[offset];
+ for(int i = 0; i < offset; ++i)
+ in_ptrs[i] = ec.getGPUPointerAddress(inputs.get(i));
+
+ // remaining inputs are side inputs (broadcast/lookup data)
+ long[] side_ptrs = new long[inputs.size() - offset];
+ for(int i = offset; i < inputs.size(); ++i)
+ side_ptrs[i - offset] = ec.getGPUPointerAddress(inputs.get(i));
+
+ if(isSinglePrecision()) {
+ float[] scalars = prepInputScalarsFloat(scalarObjects);
+
+ // ToDo: handle float
+ ret = execute_f(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), getOperatorName(),
+ in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
+ }
+ else {
+ double[] scalars = prepInputScalars(scalarObjects);
+
+ ret = execute_d(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), getOperatorName(),
+ in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
+ }
+ return ret;
+ }
+
+ @Override
+ public String getSpoofType() {
+ String[] tmp = getClass().getName().split("\\.");
+ return tmp[tmp.length-1] + "_" + getSpoofTemplateType() + "_" + getOperatorName();
+ }
+
+ // JNI entry points into the native SPOOF CUDA library (single/double precision);
+ // ctx is the native context handle owned by SpoofCompiler.
+ private native float execute_f(long ctx, String name, long[] in_ptr, long[] side_ptr,
+ long out_ptr, float[] scalars, long m, long n, long grix);
+
+ private native double execute_d(long ctx, String name, long[] in_ptr, long[] side_ptr,
+ long out_ptr, double[] scalars, long m, long n, long grix);
+}
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
index 87ddfea..3088e84 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
@@ -145,7 +145,14 @@ public abstract class SpoofOperator implements Serializable
scalars[i] = scalarObjects.get(i).getDoubleValue();
return scalars;
}
-
+
+ protected static float[] prepInputScalarsFloat(ArrayList<ScalarObject> scalarObjects) {
+ float[] scalars = new float[scalarObjects.size()];
+ for(int i=0; i < scalarObjects.size(); i++)
+ scalars[i] = (float)scalarObjects.get(i).getDoubleValue();
+ return scalars;
+ }
+
public static long getTotalInputNnz(ArrayList<MatrixBlock> inputs) {
return inputs.stream().mapToLong(in -> in.getNonZeros()).sum();
}
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
index a34b77e..fceaea4 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -72,7 +72,7 @@ public class ExecutionContext {
protected LocalVariableMap _variables;
protected long _tid = -1;
protected boolean _autoCreateVars;
-
+
//lineage map, cache, prepared dedup blocks
protected Lineage _lineage;
@@ -124,19 +124,19 @@ public class ExecutionContext {
public void setLineage(Lineage lineage) {
_lineage = lineage;
}
-
+
public boolean isAutoCreateVars() {
return _autoCreateVars;
}
-
+
public void setAutoCreateVars(boolean flag) {
_autoCreateVars = flag;
}
-
+
public void setTID(long tid) {
_tid = tid;
}
-
+
public long getTID() {
return _tid;
}
@@ -406,6 +406,14 @@ public class ExecutionContext {
return mo;
}
+ // Returns the raw device-pointer address of the GPU buffer backing obj in
+ // GPU context 0, or 0 if the matrix has no GPU object there (not GPU-resident).
+ // NOTE(review): assumes a single GPU context (index 0) — confirm for multi-GPU.
+ public long getGPUPointerAddress(MatrixObject obj) {
+
+ if(obj.getGPUObject(getGPUContext(0)) == null)
+ return 0;
+ else
+ return obj.getGPUObject(getGPUContext(0)).getPointerAddress();
+ }
+
public MatrixObject getMatrixInputForGPUInstruction(String varName, String opcode) {
GPUContext gCtx = getGPUContext(0);
MatrixObject mo = getMatrixObject(varName);
@@ -568,9 +576,9 @@ public class ExecutionContext {
return createFrameObject((FrameBlock) cb);
return null;
}
-
+
public static MatrixObject createMatrixObject(MatrixBlock mb) {
- MatrixObject ret = new MatrixObject(Types.ValueType.FP64,
+ MatrixObject ret = new MatrixObject(Types.ValueType.FP64,
OptimizerUtils.getUniqueTempFileName());
ret.acquireModify(mb);
ret.setMetaData(new MetaDataFormat(new MatrixCharacteristics(
@@ -580,7 +588,7 @@ public class ExecutionContext {
ret.release();
return ret;
}
-
+
public static FrameObject createFrameObject(FrameBlock fb) {
FrameObject ret = new FrameObject(OptimizerUtils.getUniqueTempFileName());
ret.acquireModify(fb);
@@ -589,7 +597,7 @@ public class ExecutionContext {
ret.release();
return ret;
}
-
+
public List<MatrixBlock> getMatrixInputs(CPOperand[] inputs) {
return getMatrixInputs(inputs, false);
}
diff --git a/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java b/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
index 9d265f6..a49e0f8 100644
--- a/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
+++ b/src/main/java/org/apache/sysds/runtime/functionobjects/IntegerDivide.java
@@ -52,7 +52,7 @@ public class IntegerDivide extends ValueFunction
/**
* NOTE: The R semantics of integer divide a%/%b are to compute the
* double division and subsequently cast to int. In case of a NaN
- * or +-INFINITY devision result, the overall output is NOT cast to
+ * or +-INFINITY division result, the overall output is NOT cast to
* int in order to prevent the special double values.
*
* @param in1 double input 1
@@ -61,7 +61,7 @@ public class IntegerDivide extends ValueFunction
*/
private static double executeIntDiv( double in1, double in2 )
{
- //compute normal double devision
+ //compute normal double division
double ret = in1 / in2;
//check for NaN/+-INF intermediate (cast to int would eliminate it)
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
index e895797..4fca2ad 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/GPUInstructionParser.java
@@ -38,6 +38,7 @@ import org.apache.sysds.runtime.instructions.gpu.MatrixReshapeGPUInstruction;
import org.apache.sysds.runtime.instructions.gpu.RelationalBinaryGPUInstruction;
import org.apache.sysds.runtime.instructions.gpu.ReorgGPUInstruction;
import org.apache.sysds.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
+import org.apache.sysds.runtime.instructions.gpu.SpoofCUDAInstruction;
public class GPUInstructionParser extends InstructionParser
{
@@ -157,7 +158,9 @@ public class GPUInstructionParser extends InstructionParser
String2GPUInstructionType.put( ">=" , GPUINSTRUCTION_TYPE.RelationalBinary);
// Indexing
- String2GPUInstructionType.put( RightIndex.OPCODE, GPUINSTRUCTION_TYPE.MatrixIndexing);
+ String2GPUInstructionType.put( RightIndex.OPCODE, GPUINSTRUCTION_TYPE.MatrixIndexing);
+
+ String2GPUInstructionType.put( "spoof" , GPUINSTRUCTION_TYPE.SpoofFused);
}
public static GPUInstruction parseSingleInstruction (String str ) {
@@ -217,7 +220,10 @@ public class GPUInstructionParser extends InstructionParser
case MatrixIndexing:
return MatrixIndexingGPUInstruction.parseInstruction(str);
-
+
+ case SpoofFused:
+ return SpoofCUDAInstruction.parseInstruction(str);
+
default:
throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype );
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
index e1d12f5..f4e262e 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/SpoofCPInstruction.java
@@ -55,11 +55,11 @@ public class SpoofCPInstruction extends ComputationCPInstruction {
String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
ArrayList<CPOperand> inlist = new ArrayList<>();
- Class<?> cla = CodegenUtils.getClass(parts[1]);
+ Class<?> cla = CodegenUtils.getClass(parts[2]);
SpoofOperator op = CodegenUtils.createInstance(cla);
String opcode = parts[0] + op.getSpoofType();
- for( int i=2; i<parts.length-2; i++ )
+ for( int i=3; i<parts.length-2; i++ )
inlist.add(new CPOperand(parts[i]));
CPOperand out = new CPOperand(parts[parts.length-2]);
int k = Integer.parseInt(parts[parts.length-1]);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
index e6f22e3..615e194 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
@@ -33,7 +33,7 @@ import org.apache.sysds.utils.Statistics;
public abstract class GPUInstruction extends Instruction {
private static final Log LOG = LogFactory.getLog(GPUInstruction.class.getName());
-
+
public enum GPUINSTRUCTION_TYPE {
AggregateUnary,
AggregateBinary,
@@ -47,7 +47,8 @@ public abstract class GPUInstruction extends Instruction {
BuiltinUnary,
BuiltinBinary,
Builtin,
- MatrixIndexing
+ MatrixIndexing,
+ SpoofFused
}
// Memory/conversions
@@ -159,7 +160,7 @@ public abstract class GPUInstruction extends Instruction {
instOpcode = opcode;
_requiresLabelUpdate = super.requiresLabelUpdate();
}
-
+
@Override
public IType getType() {
return IType.GPU;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
new file mode 100644
index 0000000..8049e87
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.instructions.gpu;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.hops.codegen.cplan.CNodeCell;
+import org.apache.sysds.runtime.codegen.CodegenUtils;
+import org.apache.sysds.runtime.codegen.SpoofCellwise;
+import org.apache.sysds.runtime.codegen.SpoofOperator;
+import org.apache.sysds.runtime.codegen.SpoofCUDA;
+import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
+import org.apache.sysds.runtime.instructions.cp.CPOperand;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
+import org.apache.sysds.runtime.lineage.LineageTraceable;
+import org.apache.sysds.runtime.instructions.cp.DoubleObject;
+
+import java.util.ArrayList;
+
+// GPU instruction that runs a SPOOF CUDA-generated fused operator; parsed from
+// the "spoof" opcode and dispatched by GPUInstructionParser.
+public class SpoofCUDAInstruction extends GPUInstruction implements LineageTraceable {
+ // native fused operator, registered in CodegenUtils at compilation time
+ private final SpoofCUDA _op;
+ private final CPOperand[] _in;
+
+ public final CPOperand _out;
+
+ private SpoofCUDAInstruction(SpoofOperator op, CPOperand[] in, CPOperand out, String opcode, String istr) {
+ super(null, opcode, istr);
+
+ if(!(op instanceof SpoofCUDA))
+ throw new RuntimeException("SpoofCUDAInstruction needs an operator of type SpoofCUDA!");
+
+ _op = (SpoofCUDA) op;
+ _in = in;
+ _out = out;
+ instString = istr;
+ instOpcode = opcode;
+ }
+
+ // Parses the instruction string; parts[2] carries the generated operator name,
+ // parts[3..n-3] the inputs, parts[n-2] the output (thread count is ignored).
+ public static SpoofCUDAInstruction parseInstruction(String str) {
+ String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+
+ ArrayList<CPOperand> inlist = new ArrayList<>();
+ SpoofCUDA op = CodegenUtils.getNativeOpData(parts[2]);
+ String opcode = op.getSpoofType();
+
+ for( int i=3; i<parts.length-2; i++ )
+ inlist.add(new CPOperand(parts[i]));
+ CPOperand out = new CPOperand(parts[parts.length-2]);
+
+ return new SpoofCUDAInstruction(op, inlist.toArray(new CPOperand[0]), out, opcode, str);
+ }
+
+ @Override
+ public void processInstruction(ExecutionContext ec) {
+
+ //get input matrices and scalars, incl pinning of matrices
+ ArrayList<MatrixObject> inputs = new ArrayList<>();
+ ArrayList<ScalarObject> scalars = new ArrayList<>();
+ for (CPOperand input : _in) {
+ if(input.getDataType()== Types.DataType.MATRIX)
+ inputs.add(ec.getMatrixInputForGPUInstruction(input.getName(), getExtendedOpcode()));
+ else if(input.getDataType()== Types.DataType.SCALAR) {
+ //note: even if literal, it might be compiled as scalar placeholder
+ scalars.add(ec.getScalarInput(input));
+ }
+ }
+
+ // set the output dimensions to the hop node matrix dimensions
+ if( _out.getDataType() == Types.DataType.MATRIX) {
+ long rows = inputs.get(0).getNumRows();
+ long cols = inputs.get(0).getNumColumns();
+ // cellwise aggregations reduce one output dimension to 1
+ // (braced to avoid a dangling-else misread)
+ if(_op.getSpoofTemplateType().contains("CW")) {
+ if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.COL_AGG)
+ rows = 1;
+ else if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.ROW_AGG)
+ cols = 1;
+ }
+
+ MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(_out.getName(), rows, cols).getKey();
+ ec.setMetaData(_out.getName(), out_obj.getNumRows(), out_obj.getNumColumns());
+ _op.execute(inputs, scalars, out_obj, ec);
+ ec.releaseMatrixOutputForGPUInstruction(_out.getName());
+ }
+ else if (_out.getDataType() == Types.DataType.SCALAR) {
+ // full aggregation: native call returns the scalar result directly
+ ScalarObject out = new DoubleObject(_op.execute(inputs, scalars, null, ec));
+ ec.setScalarOutput(_out.getName(), out);
+ }
+
+ // unpin all matrix inputs
+ for (CPOperand input : _in)
+ if(input.getDataType()== Types.DataType.MATRIX)
+ ec.releaseMatrixInputForGPUInstruction(input.getName());
+ }
+
+ @Override
+ public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
+ return Pair.of(_out.getName(),
+ new LineageItem(getOpcode(), LineageItemUtils.getLineage(ec, _in)));
+ }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
index 15a345c..25f3059 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContextPool.java
@@ -21,6 +21,7 @@
import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
import static jcuda.driver.JCudaDriver.cuInit;
+import static jcuda.runtime.JCuda.cudaGetDevice;
import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
import java.util.ArrayList;
@@ -83,6 +84,8 @@ public class GPUContextPool {
* All these need be done once, and not per GPU
*/
public synchronized static void initializeGPU() {
+ if (initialized)
+ return;
initialized = true;
GPUContext.LOG.info("Initializing CUDA");
long start = System.nanoTime();
@@ -210,8 +213,7 @@ public class GPUContextPool {
public static synchronized List<GPUContext> reserveAllGPUContexts() {
if (reserved)
throw new DMLRuntimeException("Trying to re-reserve GPUs");
- if (!initialized)
- initializeGPU();
+ initializeGPU();
reserved = true;
LOG.trace("GPU : Reserved all GPUs");
return pool;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index ad20f46..b2967bd 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1006,4 +1006,23 @@ public class GPUObject {
return sb.toString();
}
+ // Extracts the raw device address from a jCuda Pointer. jCuda exposes
+ // getNativePointer()/getByteOffset() only as protected members, hence the
+ // local subclass to reach them.
+ private static long getPointerAddress(Pointer p) {
+ // WORKAROUND until a method like CUdeviceptr#getAddress exists in jCuda
+ class PointerWithAddress extends Pointer
+ {
+ // copy-constructs from the given pointer to inherit its address/offset
+ PointerWithAddress(Pointer other)
+ {
+ super(other);
+ }
+ long getAddress()
+ {
+ return getNativePointer() + getByteOffset();
+ }
+ }
+ return new PointerWithAddress(p).getAddress();
+ }
+
+ // Raw device address of this object's dense buffer.
+ // NOTE(review): assumes a dense pointer is allocated — behavior for
+ // sparse-only GPU objects should be confirmed.
+ public long getPointerAddress() {
+ return getPointerAddress(getDensePointer());
+ }
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
index a6cd221..072bfca 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/JCudaKernels.java
@@ -41,7 +41,7 @@ import jcuda.runtime.JCuda;
public class JCudaKernels {
- private final static String ptxFileName = "/kernels/SystemDS.ptx";
+ private final static String ptxFileName = "/cuda/kernels/SystemDS.ptx";
private HashMap<String, CUfunction> kernels = new HashMap<>();
private CUmodule module;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
index 27f86c6..6dc2832 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/SpoofSPInstruction.java
@@ -87,11 +87,11 @@ public class SpoofSPInstruction extends SPInstruction {
//String opcode = parts[0];
ArrayList<CPOperand> inlist = new ArrayList<>();
- Class<?> cls = CodegenUtils.getClass(parts[1]);
- byte[] classBytes = CodegenUtils.getClassData(parts[1]);
+ Class<?> cls = CodegenUtils.getClass(parts[2]);
+ byte[] classBytes = CodegenUtils.getClassData(parts[2]);
String opcode = parts[0] + CodegenUtils.createInstance(cls).getSpoofType();
- for( int i=2; i<parts.length-2; i++ )
+ for( int i=3; i<parts.length-2; i++ )
inlist.add(new CPOperand(parts[i]));
CPOperand out = new CPOperand(parts[parts.length-2]);
//note: number of threads parts[parts.length-1] always ignored
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index cb70436..5d33e2e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -295,7 +295,7 @@ public class LibMatrixNative
LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
}
- private static boolean isSinglePrecision() {
+ public static boolean isSinglePrecision() {
return ConfigurationManager.getDMLConfig()
.getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single");
}
diff --git a/src/main/java/org/apache/sysds/utils/NativeHelper.java b/src/main/java/org/apache/sysds/utils/NativeHelper.java
index 36ec816..cd9f8f9 100644
--- a/src/main/java/org/apache/sysds/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysds/utils/NativeHelper.java
@@ -293,11 +293,15 @@ public class NativeHelper {
* @param optionalMsg message for debugging
* @return true if successfully loaded BLAS
*/
- private static boolean loadBLAS(String customLibPath, String blas, String optionalMsg) {
+ public static boolean loadBLAS(String customLibPath, String blas, String optionalMsg) {
// First attempt to load from custom library path
if((customLibPath != null) && (!customLibPath.equalsIgnoreCase("none"))) {
String libPath = customLibPath + File.separator + System.mapLibraryName(blas);
try {
+ // This fixes libPath if it already contained a prefix/suffix and mapLibraryName added another one.
+ libPath = libPath.replace("liblibsystemds", "libsystemds")
+ .replace(".dll.dll", ".dll")
+ .replace(".so.so", ".so");
System.load(libPath);
LOG.info("Loaded the library:" + libPath);
return true;
@@ -328,7 +332,7 @@ public class NativeHelper {
* @param libFileName library file name)
* @return true if successfully loaded BLAS
*/
- private static boolean loadLibraryHelperFromResource(String libFileName) {
+ public static boolean loadLibraryHelperFromResource(String libFileName) {
OutputStream out = null;
try(InputStream in = NativeHelper.class.getResourceAsStream("/lib/"+ libFileName)) {
// This logic is added because Java does not allow to load library from a resource file.
diff --git a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
index 65093e2..6f887ce 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
@@ -485,8 +485,8 @@ public class CellwiseTmplTest extends AutomatedTestBase
OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
runTest(true, false, null, -1);
- runRScript(true);
-
+ runRScript(true);
+
if(testname.equals(TEST_NAME6) || testname.equals(TEST_NAME7)
|| testname.equals(TEST_NAME9) || testname.equals(TEST_NAME10)) {
//compare scalars
@@ -504,7 +504,7 @@ public class CellwiseTmplTest extends AutomatedTestBase
if( !(rewrites && (testname.equals(TEST_NAME2)
|| testname.equals(TEST_NAME19))) && !testname.equals(TEST_NAME27) )
Assert.assertTrue(heavyHittersContainsSubString(
- "spoofCell", "sp_spoofCell", "spoofMA", "sp_spoofMA"));
+ "spoofCell", "sp_spoofCell", "spoofMA", "sp_spoofMA", "gpu_SpoofCUDA_CW_"));
if( testname.equals(TEST_NAME7) ) //ensure matrix mult is fused
Assert.assertTrue(!heavyHittersContainsSubString("tsmm"));
else if( testname.equals(TEST_NAME10) ) //ensure min/max is fused
diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties
index d1f4e58..b479997 100644
--- a/src/test/resources/log4j.properties
+++ b/src/test/resources/log4j.properties
@@ -25,6 +25,7 @@ log4j.logger.org.apache.sysds.api.DMLScript=OFF
log4j.logger.org.apache.sysds.test=INFO
log4j.logger.org.apache.sysds.test.AutomatedTestBase=ERROR
log4j.logger.org.apache.sysds=WARN
+#log4j.logger.org.apache.sysds.hops.codegen.SpoofCompiler=TRACE
log4j.logger.org.apache.sysds.runtime.compress.AbstractCompressedMatrixBlock=ERROR
# log4j.logger.org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory=DEBUG
# log4j.logger.org.apache.sysds.runtime.compress.cocode=DEBUG
diff --git a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml b/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
index 1becb67..f77d94d 100644
--- a/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
+++ b/src/test/scripts/functions/codegen/SystemDS-config-codegen.xml
@@ -27,4 +27,6 @@
<!-- The number of threads for the spark instance artificially selected -->
<sysds.local.spark.number.threads>16</sysds.local.spark.number.threads>
+
+ <sysds.codegen.api>auto</sysds.codegen.api>
</root>
\ No newline at end of file