You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ma...@apache.org on 2022/04/20 12:17:54 UTC
[systemds] branch main updated (811e3f474c -> 29bf8f18ad)
This is an automated email from the ASF dual-hosted git repository.
markd pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
from 811e3f474c [SYSTEMDS-3334] Codegen RowMaxs_VectMult rewrite
new fc5b03de84 [SYSTEMDS-3352] CUDA code gen support for connected components
new 29bf8f18ad [SYSTEMDS-3352] CUDA code generation binaries
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 302880 -> 285976 bytes
src/main/cuda/headers/Matrix.h | 52 +-
src/main/cuda/headers/spoof_utils.cuh | 215 ++--
src/main/cuda/headers/vector_write.cuh | 20 +-
src/main/cuda/kernels/reduction.ptx | 1185 ++++++++++++--------
src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 23 +-
src/main/cuda/spoof-launcher/SpoofRowwise.h | 8 +-
src/main/cuda/spoof-launcher/jni_bridge.cpp | 4 +-
src/main/cuda/spoof/rowwise.cu | 9 +-
.../apache/sysds/hops/codegen/cplan/CNodeRow.java | 7 +-
.../sysds/hops/codegen/cplan/cuda/Binary.java | 391 +++----
.../sysds/hops/codegen/cplan/cuda/Ternary.java | 88 +-
.../sysds/hops/codegen/cplan/cuda/Unary.java | 313 +++---
.../sysds/runtime/codegen/SpoofCUDACellwise.java | 4 +-
.../sysds/runtime/codegen/SpoofCUDARowwise.java | 4 +-
15 files changed, 1160 insertions(+), 1163 deletions(-)
[systemds] 01/02: [SYSTEMDS-3352] CUDA code gen support for connected components
Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit fc5b03de84ebd57214a69ca43f63f223dd258c89
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Wed Apr 20 13:06:05 2022 +0200
[SYSTEMDS-3352] CUDA code gen support for connected components
General cleanup and bug fixing to make components.dml run.
Also contains improvements to handle single precision execution.
---
src/main/cuda/headers/Matrix.h | 52 ++-
src/main/cuda/headers/spoof_utils.cuh | 215 +++++------
src/main/cuda/headers/vector_write.cuh | 20 +-
src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 23 +-
src/main/cuda/spoof-launcher/SpoofRowwise.h | 8 +-
src/main/cuda/spoof-launcher/jni_bridge.cpp | 4 +-
src/main/cuda/spoof/rowwise.cu | 9 +-
.../apache/sysds/hops/codegen/cplan/CNodeRow.java | 7 +-
.../sysds/hops/codegen/cplan/cuda/Binary.java | 391 ++++++++-------------
.../sysds/hops/codegen/cplan/cuda/Ternary.java | 88 ++---
.../sysds/hops/codegen/cplan/cuda/Unary.java | 313 +++++++----------
.../sysds/runtime/codegen/SpoofCUDACellwise.java | 4 +-
.../sysds/runtime/codegen/SpoofCUDARowwise.java | 4 +-
13 files changed, 462 insertions(+), 676 deletions(-)
diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
index 61ef939b83..f02a76c83c 100644
--- a/src/main/cuda/headers/Matrix.h
+++ b/src/main/cuda/headers/Matrix.h
@@ -18,8 +18,6 @@
*/
#pragma once
-#ifndef SYSTEMDS_MATRIX_H
-#define SYSTEMDS_MATRIX_H
using uint32_t = unsigned int;
using int32_t = int;
@@ -43,22 +41,22 @@ struct Matrix {
#ifdef __CUDACC__
-template<typename T>
-uint32_t bin_search(T* values, uint32_t lower, uint32_t upper, T val) {
- upper -= 1;
- while(lower <= (upper-1)) {
- uint32_t idx = (lower + upper) >> 1;
- uint32_t vi = values[idx];
- if (vi < val)
- lower = idx + 1;
- else {
- if (vi <= val)
- return idx;
- upper = idx - 1;
- }
- }
- return upper + 1;
-}
+//template<typename T>
+//uint32_t bin_search(T* values, uint32_t lower, uint32_t upper, T val) {
+// upper -= 1;
+// while(lower <= (upper-1)) {
+// uint32_t idx = (lower + upper) >> 1;
+// uint32_t vi = values[idx];
+// if (vi < val)
+// lower = idx + 1;
+// else {
+// if (vi <= val)
+// return idx;
+// upper = idx - 1;
+// }
+// }
+// return upper + 1;
+//}
template<typename T>
class MatrixAccessor {
@@ -68,11 +66,11 @@ class MatrixAccessor {
public:
MatrixAccessor() = default;
- __device__ MatrixAccessor(Matrix<T>* mat) : _mat(mat) {}
+ __device__ explicit MatrixAccessor(Matrix<T>* mat) : _mat(mat) {}
__device__ void init(Matrix<T>* mat) { _mat = mat; }
- __device__ uint32_t& nnz() { return return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
+// __device__ uint32_t& nnz() { return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
__device__ uint32_t cols() { return _mat->cols; }
__device__ uint32_t rows() { return _mat->rows; }
@@ -96,14 +94,14 @@ public:
}
__device__ uint32_t row_len(uint32_t rix) {
- return _mat->row_ptr == nullptr ? row_len_dense(rix) : row_len_sparse(rix);
+ return _mat->row_ptr == nullptr ? _mat->rows : row_len_sparse(rix);
}
__device__ uint32_t* col_idxs(uint32_t rix) { return cols_sparse(rix); }
__device__ void set(uint32_t r, uint32_t c, T v) { set_sparse(r,c,v); }
- __device__ uint32_t* indexes() { return _mat->row_ptr; }
+// __device__ uint32_t* indexes() { return _mat->row_ptr; }
__device__ bool hasData() { return _mat->data != nullptr; }
private:
@@ -127,10 +125,6 @@ private:
return &(_mat->data[rix]);
}
- __device__ uint32_t row_len_dense(uint32_t rix) {
- return _mat->rows;
- }
-
//ToDo sparse accessors
__device__ uint32_t len_sparse() {
return _mat->row_ptr[_mat->rows];
@@ -145,8 +139,8 @@ private:
}
__device__ T& val_sparse_rc(uint32_t r, uint32_t c) {
-// printf("TBI: val_sparse_rc\n");
-// asm("trap;");
+ printf("TBI: val_sparse_rc(%d, %d)\n", r, c);
+ asm("trap;");
return _mat->data[0];
}
@@ -228,5 +222,3 @@ public:
};
#endif // __CUDACC_RTC__
-
-#endif //SYSTEMDS_MATRIX_H
diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh
index 5d9b1012b2..8ab0fafdb2 100644
--- a/src/main/cuda/headers/spoof_utils.cuh
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -18,8 +18,6 @@
*/
#pragma once
-#ifndef SPOOF_UTILS_CUH
-#define SPOOF_UTILS_CUH
#include <math_constants.h>
#include "vector_add.cuh"
@@ -31,13 +29,8 @@ struct TempStorage;
#include "Matrix.h"
#include "vector_write.cuh"
-// #include "intellisense_cuda_intrinsics.h"
-
using uint32_t = unsigned int;
-//static __device__ bool debug_row() { return blockIdx.x == 0; };
-//static __device__ bool debug_thread() { return threadIdx.x == 0; }
-
__constant__ double DOUBLE_EPS = 1.11022E-16; // 2 ^ -53
__constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26
__constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it
@@ -79,12 +72,6 @@ __device__ Vector<T>& getVector(MatrixAccessor<T>& data, uint32_t n, uint32_t ri
c[i] = data.val(rix, i);
i += blockDim.x;
}
-// if(debug_thread()) {
-// printf("getVector: c.len=%d rix=%d\n", c.length, rix);
-// for(auto j = 0; j < c.length; ++j)
-// printf("%4.3f ", c[j]);
-// printf("\n");
-// }
return c;
}
@@ -146,122 +133,147 @@ __device__ T BLOCK_ROW_AGG(T *a, T *b, uint32_t len, AggOp agg_op, LoadOp load_o
auto sdata = shared_memory_proxy<T>();
uint tid = threadIdx.x;
- // Initalize shared mem and leave if tid > row length.
-// if(tid >= len) { return sdata[tid] = AggOp::init();; }
-
- __syncthreads();
-
-// if(blockIdx.x == 0 && threadIdx.x == 0)
-// printf("tid=%d sdata[tid + 128]=%f, len=%d\n", tid, len, sdata[tid+128]);
uint i = tid;
T v = AggOp::init();
-// if(blockIdx.x == 0 && threadIdx.x == 0)
-// printf("tid=%d sdata[tid + 128]=%f\n", tid, sdata[tid+128]);
- while (i < len) {
+ while(i < len) {
v = agg_op(v, load_op(a[i], b[i]));
i += blockDim.x;
}
-// if(blockIdx.x == 0 && threadIdx.x == 0)
-// if(debug_row() && debug_thread())
-// printf("tid=%d sdata[tid + 128]=%f\n", tid, sdata[tid+128]);
-
// each thread puts its local sum into shared memory
sdata[tid] = v;
- // if(blockIdx.x==0)
- // printf("tid=%d v=%f, len=%d\n", tid, v, len);
__syncthreads();
- // if(blockIdx.x == 0 && threadIdx.x == 0)
- // printf("tid=%d sdata[tid + 128]=%f\n", tid, sdata[tid+128]);
-
// do reduction in shared mem
- if (blockDim.x >= 1024) {
- if (tid < 512 && (tid+512) < len) {
- // if(blockIdx.x == 0 && threadIdx.x == 0)
- // printf("tid=%d sdata[tid + 512]=%f\n", tid, sdata[tid+512]);
+ if(blockDim.x >= 1024) {
+ if(tid < 512 && (tid+512) < len) {
sdata[tid] = v = agg_op(v, sdata[tid + 512]);
}
__syncthreads();
}
- if (blockDim.x >= 512) {
- if (tid < 256 && (tid+256) < len) {
- // if(blockIdx.x == 0 && threadIdx.x == 0)
- // printf("tid=%d sdata[tid + 256]=%f\n", tid, sdata[tid+256]);
+ if(blockDim.x >= 512) {
+ if(tid < 256 && (tid+256) < len) {
sdata[tid] = v = agg_op(v, sdata[tid + 256]);
}
__syncthreads();
}
- if (blockDim.x >= 256) {
- if (tid < 128 && (tid+128) < len) {
- // if(blockIdx.x == 0 && threadIdx.x == 0)
- // printf("tid=%d sdata[tid + 128]=%f\n", tid, sdata[tid+128]);
+ if(blockDim.x >= 256) {
+ if(tid < 128 && (tid+128) < len) {
sdata[tid] = v = agg_op(v, sdata[tid + 128]);
}
__syncthreads();
}
- if (blockDim.x >= 128) {
- if (tid < 64 && (tid+64) < len) {
- // if(blockIdx.x == 0 && threadIdx.x == 0)
- // printf("tid=%d sdata[tid + 64]=%f\n", tid, sdata[tid+64]);
+ if(blockDim.x >= 128) {
+if(tid < 64 && (tid+64) < len) {
sdata[tid] = v = agg_op(v, sdata[tid + 64]);
}
__syncthreads();
}
-
- if (tid < 32) {
+
+ if(tid < 32) {
// now that we are using warp-synchronous programming (below)
// we need to declare our shared memory volatile so that the compiler
// doesn't reorder stores to it and induce incorrect behavior.
volatile T *smem = sdata;
- if (blockDim.x >= 64 && (tid+32) < len) {
+ if(blockDim.x >= 64 && (tid+32) < len) {
smem[tid] = v = agg_op(v, smem[tid + 32]);
}
- // if(blockIdx.x==0)
- // printf("tid=%d smem[0]=%f\n", tid, smem[0]);
- if (blockDim.x >= 32 && (tid+16) < len) {
+ if(blockDim.x >= 32 && (tid+16) < len) {
smem[tid] = v = agg_op(v, smem[tid + 16]);
}
- // if(blockIdx.x==0)
- // printf("tid=%d smem[0]=%f\n", tid, smem[0]);
- if (blockDim.x >= 16 && (tid+8) < len) {
+ if(blockDim.x >= 16 && (tid+8) < len) {
smem[tid] = v = agg_op(v, smem[tid + 8]);
}
- // if(blockIdx.x==0)
- // printf("tid=%d smem[0]=%f\n", tid, smem[0]);
- if (blockDim.x >= 8 && (tid+4) < len) {
+ if(blockDim.x >= 8 && (tid+4) < len) {
smem[tid] = v = agg_op(v, smem[tid + 4]);
}
- // if(blockIdx.x==0 && threadIdx.x ==0)
- // printf("tid=%d smem[tid + 4]=%f\n", tid, smem[tid+4]);
- if (blockDim.x >= 4 && (tid+2) < len) {
+ if(blockDim.x >= 4 && (tid+2) < len) {
smem[tid] = v = agg_op(v, smem[tid + 2]);
}
- // if(blockIdx.x==0 && threadIdx.x ==0)
- // printf("tid=%d smem[0]=%f\n", tid, smem[0]);
- if (blockDim.x >= 2 && (tid+1) < len) {
- // if (blockDim.x >= 2) {
+ if(blockDim.x >= 2 && (tid+1) < len) {
smem[tid] = v = agg_op(v, smem[tid + 1]);
}
-// if(blockIdx.x==0 && threadIdx.x ==0)
-// if(debug_row() && debug_thread())
-// printf("tid=%d smem[0]=%f\n", tid, smem[0]);
}
-
__syncthreads();
return sdata[0];
}
+
+template<typename T, typename AggOp, typename LoadOp>
+__device__ T BLOCK_ROW_AGG(T *a, T *b, uint32_t* aix, uint32_t len, AggOp agg_op, LoadOp load_op) {
+ auto sdata = shared_memory_proxy<T>();
+ uint tid = threadIdx.x;
+
+ uint i = tid;
+ T v = AggOp::init();
+ while(i < len) {
+ v = agg_op(v, load_op(a[i], b[aix[i]]));
+ i += blockDim.x;
+ }
+
+ // each thread puts its local sum into shared memory
+ sdata[tid] = v;
+ __syncthreads();
+
+ // do reduction in shared mem
+ if(blockDim.x >= 1024) {
+ if(tid < 512 && (tid+512) < len) {
+ sdata[tid] = v = agg_op(v, sdata[tid + 512]);
+ }
+ __syncthreads();
+ }
+ if(blockDim.x >= 512) {
+ if(tid < 256 && (tid+256) < len) {
+ sdata[tid] = v = agg_op(v, sdata[tid + 256]);
+ }
+ __syncthreads();
+ }
+ if(blockDim.x >= 256) {
+ if(tid < 128 && (tid+128) < len) {
+ sdata[tid] = v = agg_op(v, sdata[tid + 128]);
+ }
+ __syncthreads();
+ }
+ if(blockDim.x >= 128) {
+ if(tid < 64 && (tid+64) < len) {
+ sdata[tid] = v = agg_op(v, sdata[tid + 64]);
+ }
+ __syncthreads();
+ }
+
+ if(tid < 32) {
+ // now that we are using warp-synchronous programming (below)
+ // we need to declare our shared memory volatile so that the compiler
+ // doesn't reorder stores to it and induce incorrect behavior.
+ volatile T *smem = sdata;
+ if(blockDim.x >= 64 && (tid+32) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 32]);
+ }
+ if(blockDim.x >= 32 && (tid+16) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 16]);
+ }
+ if(blockDim.x >= 16 && (tid+8) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 8]);
+ }
+ if(blockDim.x >= 8 && (tid+4) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 4]);
+ }
+ if(blockDim.x >= 4 && (tid+2) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 2]);
+ }
+ if(blockDim.x >= 2 && (tid+1) < len) {
+ smem[tid] = v = agg_op(v, smem[tid + 1]);
+ }
+ }
+ __syncthreads();
+ return sdata[0];
+}
+
template<typename T>
__device__ T dotProduct(T* a, T* b, uint32_t ai, uint32_t bi, uint32_t len) {
SumOp<T> agg_op;
ProductOp<T> load_op;
-// if(debug_row() && debug_thread())
-// printf("dot len = %d\n", len);
- T ret = BLOCK_ROW_AGG(&a[ai], &b[bi], len, agg_op, load_op);
-// if(debug_row() && debug_thread())
-// printf("bid=%d, ai=%d, dot=%f\n", blockIdx.x, ai, ret);
- return ret;
+ return BLOCK_ROW_AGG(&a[ai], &b[bi], len, agg_op, load_op);
}
template<typename T>
@@ -277,8 +289,6 @@ __device__ T vectSum(T* a, uint32_t ai, uint32_t len) {
SumOp<T> agg_op;
IdentityOp<T> load_op;
T result = BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
-// if(debug_row() && debug_thread())
-// printf("vectSum: bid=%d, tid=%d ai=%d len=%d result=%4.3f\n", blockIdx.x, threadIdx.x, ai, len, result);
return result;
}
@@ -286,10 +296,22 @@ template<typename T>
__device__ T vectMin(T* a, int ai, int len) {
MinOp<T> agg_op;
IdentityOp<T> load_op;
- T result = BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
-// if(debug_row() && debug_thread())
-// printf("vectMin: bid=%d, tid=%d ai=%d len=%d result=%4.3f\n", blockIdx.x, threadIdx.x, ai, len, result);
- return result;
+ return BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
+}
+
+template<typename T>
+__device__ T rowMaxsVectMult(T* a, T* b, uint32_t ai, uint32_t bi, uint32_t len) {
+ MaxOp<T> agg_op;
+ ProductOp<T> load_op;
+ return BLOCK_ROW_AGG(&a[ai], &b[0], len, agg_op, load_op);
+}
+
+template<typename T>
+__device__ T rowMaxsVectMult(T* a, T* b, uint32_t* aix, uint32_t ai, uint32_t bi, uint32_t len) {
+ MaxOp<T> agg_op;
+ ProductOp<T> load_op;
+
+ return BLOCK_ROW_AGG(&a[ai], &b[0], &aix[ai], len, agg_op, load_op);
}
template<typename T>
@@ -302,25 +324,7 @@ __device__ T vectMax(T* a, uint32_t ai, uint32_t len) {
template<typename T>
__device__ T vectMax(T* avals, uint32_t* aix, uint32_t ai, uint32_t alen, uint32_t len) {
-// if (debug_row() && debug_thread()) {
-// printf("\naix[i]:\n");
-// for(auto i = 0; i < alen; ++i)
-// printf(" %d", aix[i]);
-
-// printf("\navals[i]:\n");
-// for(auto i = 0; i < alen; ++i)
-// printf(" %4.3f", avals[i]);
-
-// printf("\navals[aix[i]]:\n");
-// for(auto i = 0; i < alen; ++i)
-// printf(" %4.3f", avals[aix[i]]);
-
-// printf("\n");
-// }
-
T result = vectMax(avals, ai, alen);
-// if (blockIdx.x < 5 && debug_thread())
-// printf("bid=%d, tid=%d, len=%d, alen=%d, ai=%d vectMax=%4.3f\n", blockIdx.x, threadIdx.x, len, alen, ai, result);
return alen < len ? MaxOp<T>::exec(result, 0.0) : result;
}
@@ -547,6 +551,12 @@ Vector<T>& vectMultWrite(T* a, T* b, uint32_t ai, uint32_t bi, uint32_t len, Tem
return vectWriteBinary<T, ProductOp<T>>(a, b, ai, bi, len, fop, "Mult");
}
+// sparse-dense MxV
+template<typename T>
+Vector<T>& vectMultWrite(T* avals, T* b, uint32_t* aix, uint32_t ai, uint32_t bi, uint32_t alen, uint32_t len, TempStorage<T>* fop) {
+ return vectWriteBinary<T, ProductOp<T>>(avals, b, aix, ai, bi, alen, len, fop, "Mult");
+}
+
template<typename T>
Vector<T>& vectDivWrite(T* a, T b, uint32_t ai, uint32_t len, TempStorage<T>* fop) {
return vectWriteBinary<T, DivOp<T>>(a, b, ai, len, fop, "Div");
@@ -744,6 +754,3 @@ void vectOuterMultAdd(T* a, T* b, T* c, uint32_t ai, uint32_t bi, uint32_t ci, u
i += blockDim.x;
}
}
-
-
-#endif // SPOOF_UTILS_CUH
diff --git a/src/main/cuda/headers/vector_write.cuh b/src/main/cuda/headers/vector_write.cuh
index 3099926167..55241bd8d4 100644
--- a/src/main/cuda/headers/vector_write.cuh
+++ b/src/main/cuda/headers/vector_write.cuh
@@ -18,10 +18,9 @@
*/
#pragma once
-#ifndef SYSTEMDS_VECTOR_WRITE_CUH
-#define SYSTEMDS_VECTOR_WRITE_CUH
-__device__ bool debug_row() { return blockIdx.x == 1; };
+#define DEBUG_ROW 2
+__device__ bool debug_row() { return blockIdx.x == DEBUG_ROW; };
__device__ bool debug_thread() { return threadIdx.x == 0; }
// unary transform vector by OP and write to intermediate vector
@@ -143,6 +142,19 @@ __device__ Vector<T>& vectWriteBinary(T* a, T* b, uint32_t ai, uint32_t bi, uint
return c;
}
+// sparse binary vect-vect to intermediate vector
+template<typename T, typename OP>
+__device__ Vector<T>& vectWriteBinary(T* a, T* b, uint32_t* aix, uint32_t ai, uint32_t bi, uint32_t alen, uint32_t len,
+ TempStorage<T>* fop, const char* name = nullptr) {
+ uint32_t i = threadIdx.x;
+ Vector<T>& c = fop->getTempStorage(len);
+ while (i < alen) {
+ c[aix[ai+i]] = OP::exec(a[ai + i], b[aix[ai+i]]);
+ i += blockDim.x;
+ }
+ return c;
+}
+
// binary vector-scalar to output vector c
template<typename T, typename OP>
__device__ void vectWriteBinary(T* a, T b, T* c, uint32_t ai, uint32_t ci, uint32_t len) {
@@ -168,5 +180,3 @@ __device__ void vectWriteBinary(T* a, T* b, T* c, uint32_t ai, uint32_t bi, uint
i += blockDim.x;
}
}
-
-#endif //SYSTEMDS_VECTOR_WRITE_CUH
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 2ef482d18d..c4ae1e3dff 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -47,11 +47,7 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc
s1 << "-I" << resource_path << "/cuda/headers";
s2 << "-I" << resource_path << "/cuda/spoof";
auto ctx = new SpoofCUDAContext(resource_path,{s1.str(), s2.str(), cuda_include_path});
- // cuda device is handled by jCuda atm
- //cudaSetDevice(device_id);
- //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
- //cudaDeviceSynchronize();
-
+
CHECK_CUDA(cuModuleLoad(&(ctx->reductions), std::string(ctx->resource_path + std::string("/cuda/kernels/reduction.ptx")).c_str()));
CUfunction func;
@@ -87,30 +83,13 @@ void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] uint
cudaFreeHost(ctx->staging_buffer);
cudaFree(ctx->device_buffer);
delete ctx;
- // cuda device is handled by jCuda atm
- //cudaDeviceReset();
}
size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::string &src) {
-#ifndef NDEBUG
-// std::cout << "---=== START source listing of spoof cuda kernel [ " << name << " ]: " << std::endl;
-// uint32_t line_num = 0;
-// std::istringstream src_stream(src);
-// for(std::string line; std::getline(src_stream, line); line_num++)
-// std::cout << line_num << ": " << line << std::endl;
-// std::cout << "---=== END source listing of spoof cuda kernel [ " << name << " ]." << std::endl;
- std::cout << "cwd: " << std::filesystem::current_path() << std::endl;
- std::cout << "include_paths: ";
- for_each (include_paths.begin(), include_paths.end(), [](const std::string& line){ std::cout << line << '\n';});
- std::cout << std::endl;
-#endif
-
-// uncomment all related lines for temporary timing output:
// auto compile_start = clk::now();
op->program = std::make_unique<jitify::Program>(kernel_cache.program(src, 0, include_paths));
// auto compile_end = clk::now();
// auto compile_duration = std::chrono::duration_cast<sec>(compile_end - compile_start).count();
-
compiled_ops.push_back(std::move(op));
// compile_total += compile_duration;
// std::cout << name << " compiled in "
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 4465ac99fa..01ec5206aa 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -18,15 +18,13 @@
*/
#pragma once
-#ifndef SYSTEMDS_SPOOFROWWISE_H
-#define SYSTEMDS_SPOOFROWWISE_H
#include "SpoofCUDAContext.h"
#include <algorithm>
template <typename T>
struct SpoofRowwise {
-
+
static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* _op, DataBufferWrapper* dbw) {
uint32_t NT=256;
T value_type;
@@ -56,7 +54,7 @@ struct SpoofRowwise {
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, op->stream));
}
-
+
std::string op_name(op->name + "_DENSE");
if(sparse_input)
op_name = std::string(op->name + "_SPARSE");
@@ -77,5 +75,3 @@ struct SpoofRowwise {
CHECK_CUDART(cudaFree(d_temp));
}
};
-
-#endif //SYSTEMDS_SPOOFROWWISE_H
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 5134d5e292..65f4a5a19f 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -165,7 +165,7 @@ int launch_spoof_operator([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jclass
[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
(JNIEnv *jenv, jclass jobj, jlong ctx) {
- return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx);
+ return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, ctx);
}
@@ -177,5 +177,5 @@ int launch_spoof_operator([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jclass
[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
(JNIEnv *jenv, jclass jobj, jlong ctx) {
- return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx);
+ return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, ctx);
}
\ No newline at end of file
diff --git a/src/main/cuda/spoof/rowwise.cu b/src/main/cuda/spoof/rowwise.cu
index b31ce0c2ce..917b8e7087 100644
--- a/src/main/cuda/spoof/rowwise.cu
+++ b/src/main/cuda/spoof/rowwise.cu
@@ -48,15 +48,14 @@ struct SpoofRowwiseOp //%HAS_TEMP_VECT%
a.init(A);
c.init(C);
- if(B)
- for(auto i = 0; i < NUM_B; ++i)
- b[i].init(&(B[i]));
+ if(B) {
+ for(auto i = 0; i < NUM_B; ++i)
+ b[i].init(&(B[i]));
+ }
}
__device__ __forceinline__ void exec_dense(uint32_t ai, uint32_t ci, uint32_t rix) {
//%BODY_dense%
- if (debug_row() && debug_thread())
- printf("c[0]=%4.3f\n", c.vals(0)[0]);
}
__device__ __forceinline__ void exec_sparse(uint32_t ai, uint32_t ci, uint32_t rix) {
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
index 88844d6fbb..13c8e10fca 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
@@ -56,9 +56,12 @@ public class CNodeRow extends CNodeTpl
private static final String TEMPLATE_NOAGG_OUT = " LibSpoofPrimitives.vectWrite(%IN%, c, ci, %LEN%);\n";
private static final String TEMPLATE_NOAGG_CONST_OUT_CUDA = "\t\tvectWrite(%IN%, c.vals(0), 0, ci, %LEN%);\n";
private static final String TEMPLATE_NOAGG_OUT_CUDA = "\t\tvectWrite(%IN%, c.vals(0), 0, ci, %LEN%);\n";
- private static final String TEMPLATE_ROWAGG_OUT_CUDA = "\t\tif(threadIdx.x == 0){\n\t\t\t*(c.vals(rix)) = %IN%;\n//printf(\"rix=%d TMP7=%f TMP8=%f %IN%=%f\\n\",rix, TMP7, TMP8,%IN%);\n}\n";
+// private static final String TEMPLATE_ROWAGG_OUT_CUDA = "\t\tif(threadIdx.x == 0){\n\t\t\t*(c.vals(rix)) = %IN%;\n//printf(\"rix=%d TMP7=%f TMP8=%f %IN%=%f\\n\",rix, TMP7, TMP8,%IN%);\n}\n";
+private static final String TEMPLATE_ROWAGG_OUT_CUDA = "\t\tif(threadIdx.x == 0){\n\t\t\t*(c.vals(rix)) = %IN%;\n\t\t}\n";
+// private static final String TEMPLATE_FULLAGG_OUT_CUDA =
+// "\t\tif(threadIdx.x == 0) {\n\t\t\tT old = atomicAdd(c.vals(0), %IN%);\n//\t\t\tprintf(\"bid=%d full_agg add %f to %f\\n\",blockIdx.x, %IN%, old);\n\t\t}\n";
private static final String TEMPLATE_FULLAGG_OUT_CUDA =
- "\t\tif(threadIdx.x == 0) {\n\t\t\tT old = atomicAdd(c.vals(0), %IN%);\n//\t\t\tprintf(\"bid=%d full_agg add %f to %f\\n\",blockIdx.x, %IN%, old);\n\t\t}\n";
+ "\t\tif(threadIdx.x == 0) {\n\t\tT old = atomicAdd(c.vals(0), %IN%);\n\t\t}\n";
public CNodeRow(ArrayList<CNode> inputs, CNode output ) {
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
index 6d826b16b4..ec46e1196c 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
@@ -42,267 +42,156 @@ public class Binary extends CodeTemplate
"\t\tVector<T>& %TMP% = vectCbindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN1%, %LEN2%, this);\n" :
"\t\tVector<T>& %TMP% = vectCbindWrite(%IN1%, %IN2%, %POS1%, %POS2%, %LEN1%, %LEN2%, this);\n";
}
-
- if(isSinglePrecision()) {
- switch(type) {
- case DOT_PRODUCT:
- return sparseLhs ? " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- case VECT_MATRIXMULT:
- return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : " T[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- case VECT_OUTERMULT_ADD:
- return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
- //vector-scalar-add operations
- case VECT_MULT_ADD:
- case VECT_DIV_ADD:
- case VECT_MINUS_ADD:
- case VECT_PLUS_ADD:
- case VECT_POW_ADD:
- case VECT_XOR_ADD:
- case VECT_MIN_ADD:
- case VECT_MAX_ADD:
- case VECT_EQUAL_ADD:
- case VECT_NOTEQUAL_ADD:
- case VECT_LESS_ADD:
- case VECT_LESSEQUAL_ADD:
- case VECT_GREATER_ADD:
- case VECT_GREATEREQUAL_ADD:
- case VECT_CBIND_ADD: {
- String vectName = type.getVectorPrimitiveName();
- if(scalarVector)
- return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
- else
- return sparseLhs ? " LibSpoofPrimitives.vect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : " LibSpoofPrimitives.vect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
- }
+ switch(type) {
+ case ROWMAXS_VECTMULT:
+ return sparseLhs ? "\t\tT %TMP% = rowMaxsVectMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
+ "\t\tT %TMP% = rowMaxsVectMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+ case DOT_PRODUCT:
+ return sparseLhs ? "\t\tT %TMP% = dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = dotProduct(%IN1%, %IN2%, %POS1%, static_cast<uint32_t>(%POS2%), %LEN%);\n";
- //vector-scalar operations
- case VECT_MULT_SCALAR:
- case VECT_DIV_SCALAR:
- case VECT_MINUS_SCALAR:
- case VECT_PLUS_SCALAR:
- case VECT_POW_SCALAR:
- case VECT_XOR_SCALAR:
- case VECT_BITWAND_SCALAR:
- case VECT_MIN_SCALAR:
- case VECT_MAX_SCALAR:
- case VECT_EQUAL_SCALAR:
- case VECT_NOTEQUAL_SCALAR:
- case VECT_LESS_SCALAR:
- case VECT_LESSEQUAL_SCALAR:
- case VECT_GREATER_SCALAR:
- case VECT_GREATEREQUAL_SCALAR: {
- String vectName = type.getVectorPrimitiveName();
- if(scalarVector)
- return sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
- else
- return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
- }
- //vector-vector operations
- case VECT_MULT:
- case VECT_DIV:
- case VECT_MINUS:
- case VECT_PLUS:
- case VECT_XOR:
- case VECT_BITWAND:
- case VECT_BIASADD:
- case VECT_BIASMULT:
- case VECT_MIN:
- case VECT_MAX:
- case VECT_EQUAL:
- case VECT_NOTEQUAL:
- case VECT_LESS:
- case VECT_LESSEQUAL:
- case VECT_GREATER:
- case VECT_GREATEREQUAL: {
- String vectName = type.getVectorPrimitiveName();
- return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
- }
+ case VECT_MATRIXMULT:
+ return sparseLhs ? " T[] %TMP% = vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : " Vector<T>& %TMP% = vectMatrixMult(%IN1%, %IN2%, %POS1%, static_cast<uint32_t>(%POS2%), %LEN%, this);\n";
+ case VECT_OUTERMULT_ADD:
+ return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : "\t\tvectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
- //scalar-scalar operations
- case MULT:
- return " T %TMP% = %IN1% * %IN2%;\n";
- case DIV:
- return " T %TMP% = %IN1% / %IN2%;\n";
- case PLUS:
- return " T %TMP% = %IN1% + %IN2%;\n";
- case MINUS:
- return " T %TMP% = %IN1% - %IN2%;\n";
- case MODULUS:
- return " T %TMP% = modulus(%IN1%, %IN2%);\n";
- case INTDIV:
- return " T %TMP% = intDiv(%IN1%, %IN2%);\n";
- case LESS:
- return " T %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n";
- case LESSEQUAL:
- return " T %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n";
- case GREATER:
- return " T %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n";
- case GREATEREQUAL:
- return " T %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n";
- case EQUAL:
- return " T %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n";
- case NOTEQUAL:
- return " T %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n";
-
- case MIN:
- return " T %TMP% = fminf(%IN1%, %IN2%);\n";
- case MAX:
- return " T %TMP% = fmaxf(%IN1%, %IN2%);\n";
- case LOG:
- return " T %TMP% = logf(%IN1%)/Math.log(%IN2%);\n";
- case LOG_NZ:
- return " T %TMP% = (%IN1% == 0) ? 0 : logf(%IN1%) / logf(%IN2%);\n";
- case POW:
- return " T %TMP% = powf(%IN1%, %IN2%);\n";
- case MINUS1_MULT:
- return " T %TMP% = 1 - %IN1% * %IN2%;\n";
- case MINUS_NZ:
- return " T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
- case XOR:
- return " T %TMP% = ( (%IN1% != 0) != (%IN2% != 0) ) ? 1.0f : 0.0f;\n";
- case BITWAND:
- return " T %TMP% = bwAnd(%IN1%, %IN2%);\n";
- case SEQ_RIX:
- return " T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
-
- default:
- throw new RuntimeException("Invalid binary type: " + this.toString());
+ //vector-scalar-add operations
+ case VECT_MULT_ADD:
+ case VECT_DIV_ADD:
+ case VECT_MINUS_ADD:
+ case VECT_PLUS_ADD:
+ case VECT_POW_ADD:
+ case VECT_XOR_ADD:
+ case VECT_MIN_ADD:
+ case VECT_MAX_ADD:
+ case VECT_EQUAL_ADD:
+ case VECT_NOTEQUAL_ADD:
+ case VECT_LESS_ADD:
+ case VECT_LESSEQUAL_ADD:
+ case VECT_GREATER_ADD:
+ case VECT_GREATEREQUAL_ADD:
+ case VECT_CBIND_ADD: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseLhs ? "\t\tvect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : "\t\tvect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+ else
+ return sparseLhs ? "\t\tvect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : "\t\tvect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, static_cast<uint32_t>(%POSOUT%), %LEN%);\n";
}
- }
- else {
- switch(type) {
- case DOT_PRODUCT:
-// return sparseLhs ? " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-// return sparseLhs ? " T %TMP% = dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n printf(\"dot=%f, bid=%d, tid=%d\\n\",TMP7,blockIdx.x, threadIdx.x);\n __syncthreads();\n";
- return sparseLhs ? " T %TMP% = dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" : " T %TMP% = dotProduct(%IN1%, %IN2%, %POS1%, static_cast<uint32_t>(%POS2%), %LEN%);\n";
-
- case VECT_MATRIXMULT:
- return sparseLhs ? " T[] %TMP% = vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : " Vector<T>& %TMP% = vectMatrixMult(%IN1%, %IN2%, %POS1%, static_cast<uint32_t>(%POS2%), %LEN%, this);\n";
- case VECT_OUTERMULT_ADD:
- return sparseLhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : sparseRhs ? " LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" : "\t\tvectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
-
- //vector-scalar-add operations
- case VECT_MULT_ADD:
- case VECT_DIV_ADD:
- case VECT_MINUS_ADD:
- case VECT_PLUS_ADD:
- case VECT_POW_ADD:
- case VECT_XOR_ADD:
- case VECT_MIN_ADD:
- case VECT_MAX_ADD:
- case VECT_EQUAL_ADD:
- case VECT_NOTEQUAL_ADD:
- case VECT_LESS_ADD:
- case VECT_LESSEQUAL_ADD:
- case VECT_GREATER_ADD:
- case VECT_GREATEREQUAL_ADD:
- case VECT_CBIND_ADD: {
- String vectName = type.getVectorPrimitiveName();
- if(scalarVector)
- return sparseLhs ? "\t\tvect" + vectName + "Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : "\t\tvect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
- else
- return sparseLhs ? "\t\tvect" + vectName + "Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : "\t\tvect" + vectName + "Add(%IN1%, %IN2%, %OUT%, %POS1%, static_cast<uint32_t>(%POSOUT%), %LEN%);\n";
- }
- //vector-scalar operations
- case VECT_MULT_SCALAR:
- case VECT_DIV_SCALAR:
- case VECT_MINUS_SCALAR:
- case VECT_PLUS_SCALAR:
- case VECT_POW_SCALAR:
- case VECT_XOR_SCALAR:
- case VECT_BITWAND_SCALAR:
- case VECT_MIN_SCALAR:
- case VECT_MAX_SCALAR:
- case VECT_EQUAL_SCALAR:
- case VECT_NOTEQUAL_SCALAR:
- case VECT_LESS_SCALAR:
- case VECT_LESSEQUAL_SCALAR:
- case VECT_GREATER_SCALAR:
- case VECT_GREATEREQUAL_SCALAR: {
- String vectName = type.getVectorPrimitiveName();
- if(scalarVector)
- return sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%, this);\n";
- else
+ //vector-scalar operations
+ case VECT_MULT_SCALAR:
+ case VECT_DIV_SCALAR:
+ case VECT_MINUS_SCALAR:
+ case VECT_PLUS_SCALAR:
+ case VECT_POW_SCALAR:
+ case VECT_XOR_SCALAR:
+ case VECT_BITWAND_SCALAR:
+ case VECT_MIN_SCALAR:
+ case VECT_MAX_SCALAR:
+ case VECT_EQUAL_SCALAR:
+ case VECT_NOTEQUAL_SCALAR:
+ case VECT_LESS_SCALAR:
+ case VECT_LESSEQUAL_SCALAR:
+ case VECT_GREATER_SCALAR:
+ case VECT_GREATEREQUAL_SCALAR: {
+ String vectName = type.getVectorPrimitiveName();
+ if(scalarVector)
+ return sparseRhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2%, %POS2%, %LEN%, this);\n";
+ else
// return sparseLhs ? " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : " T[] %TMP% = LibSpoofPrimitives.vect" + vectName + "Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
- return sparseLhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%, this);\n" : " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2%, static_cast<uint32_t>(%POS1%), %LEN%, this);\n";
- }
-
- //vector-vector operations
- case VECT_MULT:
- case VECT_DIV:
- case VECT_MINUS:
- case VECT_PLUS:
- case VECT_XOR:
- case VECT_BITWAND:
- case VECT_BIASADD:
- case VECT_BIASMULT:
- case VECT_MIN:
- case VECT_MAX:
- case VECT_EQUAL:
- case VECT_NOTEQUAL:
- case VECT_LESS:
- case VECT_LESSEQUAL:
- case VECT_GREATER:
- case VECT_GREATEREQUAL: {
- String vectName = type.getVectorPrimitiveName();
- return sparseLhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, " +
- "alen, %LEN%);\n" : sparseRhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2v%, " +
- "%POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : " Vector<T>& %TMP% = vect" + vectName +
- "Write(%IN1%, %IN2%, static_cast<uint32_t>(%POS1%), static_cast<uint32_t>(%POS2%), %LEN%, this);\n";
- }
+ return sparseLhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%, this);\n" : " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2%, static_cast<uint32_t>(%POS1%), %LEN%, this);\n";
+ }
- //scalar-scalar operations
- case MULT:
- return " T %TMP% = %IN1% * %IN2%;\n";
- case DIV:
- return " T %TMP% = %IN1% / %IN2%;\n";
- case PLUS:
- return " T %TMP% = %IN1% + %IN2%;\n";
- case MINUS:
- return " T %TMP% = %IN1% - %IN2%;\n";
- case MODULUS:
- return " T %TMP% = modulus(%IN1%, %IN2%);\n";
- case INTDIV:
- return " T %TMP% = intDiv(%IN1%, %IN2%);\n";
- case LESS:
- return " T %TMP% = (%IN1% < %IN2%) ? 1.0 : 0.0;\n";
- case LESSEQUAL:
- return " T %TMP% = (%IN1% <= %IN2%) ? 1.0 : 0.0;\n";
- case GREATER:
- return " T %TMP% = (%IN1% > (%IN2% + EPSILON)) ? 1.0 : 0.0;\n";
- case GREATEREQUAL:
- return " T %TMP% = (%IN1% >= %IN2%) ? 1.0 : 0.0;\n";
- case EQUAL:
- return " T %TMP% = (%IN1% == %IN2%) ? 1.0 : 0.0;\n";
- case NOTEQUAL:
- return " T %TMP% = (%IN1% != %IN2%) ? 1.0 : 0.0;\n";
+ //vector-vector operations
+ case VECT_MULT:
+ case VECT_DIV:
+ case VECT_MINUS:
+ case VECT_PLUS:
+ case VECT_XOR:
+ case VECT_BITWAND:
+ case VECT_BIASADD:
+ case VECT_BIASMULT:
+ case VECT_MIN:
+ case VECT_MAX:
+ case VECT_EQUAL:
+ case VECT_NOTEQUAL:
+ case VECT_LESS:
+ case VECT_LESSEQUAL:
+ case VECT_GREATER:
+ case VECT_GREATEREQUAL: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparseLhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1v%, %IN2%, %IN1i%, " +
+ "%POS1%, %POS2%, alen, %LEN%, this);\n" :
+ sparseRhs ? " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2v%, %POS1%, " +
+ "%IN2i%, %POS2%, alen, %LEN%);\n" :
+ " Vector<T>& %TMP% = vect" + vectName + "Write(%IN1%, %IN2%, " +
+ "static_cast<uint32_t>(%POS1%), static_cast<uint32_t>(%POS2%), %LEN%, this);\n";
+ }
- case MIN:
- return " T %TMP% = min(%IN1%, %IN2%);\n";
- case MAX:
- return " T %TMP% = max(%IN1%, %IN2%);\n";
- case LOG:
- return " T %TMP% = log(%IN1%)/Math.log(%IN2%);\n";
- case LOG_NZ:
- return " T %TMP% = (%IN1% == 0) ? 0 : log(%IN1%) / log(%IN2%);\n";
- case POW:
- return " T %TMP% = pow(%IN1%, %IN2%);\n";
- case MINUS1_MULT:
- return " T %TMP% = 1 - %IN1% * %IN2%;\n";
- case MINUS_NZ:
- return " T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
- case XOR:
+ //scalar-scalar operations
+ case MULT:
+ return " T %TMP% = %IN1% * %IN2%;\n";
+ case DIV:
+ return "\t\tT %TMP% = %IN1% / %IN2%;\n";
+ case PLUS:
+ return "\t\tT %TMP% = %IN1% + %IN2%;\n";
+ case MINUS:
+ return " T %TMP% = %IN1% - %IN2%;\n";
+ case MODULUS:
+ return " T %TMP% = modulus(%IN1%, %IN2%);\n";
+ case INTDIV:
+ return " T %TMP% = intDiv(%IN1%, %IN2%);\n";
+ case LESS:
+ return " T %TMP% = (%IN1% < %IN2%) ? 1.0 : 0.0;\n";
+ case LESSEQUAL:
+ return " T %TMP% = (%IN1% <= %IN2%) ? 1.0 : 0.0;\n";
+ case GREATER:
+ return " T %TMP% = (%IN1% > (%IN2% + EPSILON)) ? 1.0 : 0.0;\n";
+ case GREATEREQUAL:
+ return " T %TMP% = (%IN1% >= %IN2%) ? 1.0 : 0.0;\n";
+ case EQUAL:
+ return " T %TMP% = (%IN1% == %IN2%) ? 1.0 : 0.0;\n";
+ case NOTEQUAL:
+ return "\t\tT %TMP% = (%IN1% != %IN2%) ? 1.0 : 0.0;\n";
+ case MIN:
+ if(isSinglePrecision())
+ return "\t\tT %TMP% = fminf(%IN1%, %IN2%);\n";
+ else
+ return "\t\tT %TMP% = min(%IN1%, %IN2%);\n";
+ case MAX:
+ if(isSinglePrecision())
+ return "\t\tT %TMP% = fmaxf(%IN1%, %IN2%);\n";
+ else
+ return "\t\tT %TMP% = max(%IN1%, %IN2%);\n";
+ case LOG:
+ if(isSinglePrecision())
+ return "\t\tT %TMP% = logf(%IN1%) / logf(%IN2%);\n";
+ else
+ return "\t\tT %TMP% = log(%IN1%) / log(%IN2%);\n";
+ case LOG_NZ:
+ if(isSinglePrecision())
+ return "\t\tT %TMP% = (%IN1% == 0) ? 0 : logf(%IN1%) / logf(%IN2%);\n";
+ else
+ return "\t\tT %TMP% = (%IN1% == 0) ? 0 : log(%IN1%) / log(%IN2%);\n";
+ case POW:
+ if(isSinglePrecision())
+ return "\t\tT %TMP% = powf(%IN1%, %IN2%);\n";
+ else
+ return "\t\tT %TMP% = pow(%IN1%, %IN2%);\n";
+ case MINUS1_MULT:
+ return " T %TMP% = 1 - %IN1% * %IN2%;\n";
+ case MINUS_NZ:
+ return " T %TMP% = (%IN1% != 0) ? %IN1% - %IN2% : 0;\n";
+ case XOR:
// return " T %TMP% = ( (%IN1% != 0.0) != (%IN2% != 0.0) ) ? 1.0 : 0.0;\n";
- return " T %TMP% = ( (%IN1% < EPSILON) != (%IN2% < EPSILON) ) ? 1.0 : 0.0;\n";
- case BITWAND:
- return " T %TMP% = bwAnd(%IN1%, %IN2%);\n";
- case SEQ_RIX:
- return " T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+ return " T %TMP% = ( (%IN1% < EPSILON) != (%IN2% < EPSILON) ) ? 1.0 : 0.0;\n";
+ case BITWAND:
+ return " T %TMP% = bwAnd(%IN1%, %IN2%);\n";
+ case SEQ_RIX:
+ return "\t\tT %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
- default:
- throw new RuntimeException("Invalid binary type: " + this.toString());
- }
+ default:
+ throw new RuntimeException("Invalid binary type: " + this.toString());
}
}
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java
index dd06d6c004..026fe264f8 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java
@@ -28,81 +28,41 @@ public class Ternary extends CodeTemplate {
@Override
public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
- if(isSinglePrecision()) {
- switch (type) {
- case PLUS_MULT:
- return " T %TMP% = %IN1% + %IN2% * %IN3%;\n";
+ switch (type) {
+ case PLUS_MULT:
+ return " T %TMP% = %IN1% + %IN2% * %IN3%;\n";
- case MINUS_MULT:
- return " T %TMP% = %IN1% - %IN2% * %IN3%;\n";
+ case MINUS_MULT:
+ return " T %TMP% = %IN1% - %IN2% * %IN3%;\n";
- case BIASADD:
- return " T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
+ case BIASADD:
+ return " T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
- case BIASMULT:
- return " T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
+ case BIASMULT:
+ return " T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
- case REPLACE:
- return " T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
- + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
+ case REPLACE:
+ return " T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
+ + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
- case REPLACE_NAN:
- return " T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
+ case REPLACE_NAN:
+ return " T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
- case IFELSE:
- return " T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
+ case IFELSE:
+ return " T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
- case LOOKUP_RC1:
- return sparse ?
- " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
+ case LOOKUP_RC1:
+ return sparse ?
+ " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
// " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
- " T %TMP% = %IN1%.val(rix, %IN3%-1);\n";
+ " T %TMP% = %IN1%.val(rix, %IN3%-1);\n";
- case LOOKUP_RVECT1:
- return "\t\tVector<T>& %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n";
- default:
- throw new RuntimeException("Invalid ternary type: " + this.toString());
- }
- }
- else {
- switch (type) {
- case PLUS_MULT:
- return " T %TMP% = %IN1% + %IN2% * %IN3%;\n";
-
- case MINUS_MULT:
- return " T %TMP% = %IN1% - %IN2% * %IN3%;\n";
-
- case BIASADD:
- return " T %TMP% = %IN1% + getValue(%IN2%, cix/%IN3%);\n";
-
- case BIASMULT:
- return " T %TMP% = %IN1% * getValue(%IN2%, cix/%IN3%);\n";
-
- case REPLACE:
- return " T %TMP% = (%IN1% == %IN2% || (isnan(%IN1%) "
- + "&& isnan(%IN2%))) ? %IN3% : %IN1%;\n";
-
- case REPLACE_NAN:
- return " T %TMP% = isnan(%IN1%) ? %IN3% : %IN1%;\n";
-
- case IFELSE:
- return " T %TMP% = (%IN1% != 0) ? %IN2% : %IN3%;\n";
-
- case LOOKUP_RC1:
- return sparse ?
- " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" :
-// " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n";
- " T %TMP% = %IN1%.val(rix, %IN3%-1);\n";
-
-
- case LOOKUP_RVECT1:
- return "\t\tVector<T>& %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1, this);\n";
-
- default:
- throw new RuntimeException("Invalid ternary type: "+this.toString());
- }
+ case LOOKUP_RVECT1:
+ return "\t\tVector<T>& %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1, this);\n";
+ default:
+ throw new RuntimeException("Invalid ternary type: "+this.toString());
}
}
}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
index f2405d5b5c..405b880715 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
@@ -29,210 +29,161 @@ public class Unary extends CodeTemplate {
@Override
public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
- if(isSinglePrecision()) {
- switch( type ) {
- case ROW_SUMS:
- case ROW_SUMSQS:
- case ROW_MINS:
- case ROW_MAXS:
- case ROW_MEANS:
- case ROW_COUNTNNZS: {
- String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
- return sparse ? " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n":
- " T %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n";
- }
+ switch( type ) {
+ case ROW_SUMS:
+ case ROW_SUMSQS:
+ case ROW_MINS:
+ case ROW_MAXS:
+ case ROW_MEANS:
+ case ROW_COUNTNNZS: {
+ String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
+ return sparse ? " T %TMP% = vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, %LEN%);\n":
+ " T %TMP% = vect"+vectName+"(%IN1%, static_cast<uint32_t>(%POS1%), %LEN%);\n";
- case VECT_EXP:
- case VECT_POW2:
- case VECT_MULT2:
- case VECT_SQRT:
- case VECT_LOG:
- case VECT_ABS:
- case VECT_ROUND:
- case VECT_CEIL:
- case VECT_FLOOR:
- case VECT_SIGN:
- case VECT_SIN:
- case VECT_COS:
- case VECT_TAN:
- case VECT_ASIN:
- case VECT_ACOS:
- case VECT_ATAN:
- case VECT_SINH:
- case VECT_COSH:
- case VECT_TANH:
- case VECT_CUMSUM:
- case VECT_CUMMIN:
- case VECT_CUMMAX:
- case VECT_SPROP:
- case VECT_SIGMOID: {
- String vectName = type.getVectorPrimitiveName();
- return sparse ? " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, len);\n" :
- " T[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
- }
-
- case EXP:
- return " T %TMP% = expf(%IN1%);\n";
- case LOOKUP_R:
- return sparse ?
- " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
- " T %TMP% = getValue(%IN1%, rix);\n";
- case LOOKUP_C:
- return " T %TMP% = getValue(%IN1%, n, 0, cix);\n";
- case LOOKUP_RC:
- return " T %TMP% = getValue(%IN1%, n, rix, cix);\n";
- case LOOKUP0:
- return " T %TMP% = %IN1%[0];\n";
- case POW2:
- return " T %TMP% = %IN1% * %IN1%;\n";
- case MULT2:
- return " T %TMP% = %IN1% + %IN1%;\n";
- case ABS:
- return " T %TMP% = fabsf(%IN1%);\n";
- case SIN:
- return " T %TMP% = sinf(%IN1%);\n";
- case COS:
- return " T %TMP% = cosf(%IN1%);\n";
- case TAN:
- return " T %TMP% = tanf(%IN1%);\n";
- case ASIN:
- return " T %TMP% = asinf(%IN1%);\n";
- case ACOS:
- return " T %TMP% = acosf(%IN1%);\n";
- case ATAN:
- return " T %TMP% = atanf(%IN1%);\n";
- case SINH:
- return " T %TMP% = sinhf(%IN1%);\n";
- case COSH:
- return " T %TMP% = coshf(%IN1%);\n";
- case TANH:
- return " T %TMP% = tanhf(%IN1%);\n";
- case SIGN:
- return " T %TMP% = signbit(%IN1%) == 0 ? 1.0f : -1.0f;\n";
- case SQRT:
- return " T %TMP% = sqrtf(%IN1%);\n";
- case LOG:
- return " T %TMP% = logf(%IN1%);\n";
- case ROUND:
- return " T %TMP% = roundf(%IN1%);\n";
- case CEIL:
- return " T %TMP% = ceilf(%IN1%);\n";
- case FLOOR:
- return " T %TMP% = floorf(%IN1%);\n";
- case SPROP:
- return " T %TMP% = %IN1% * (1 - %IN1%);\n";
- case SIGMOID:
- return " T %TMP% = 1 / (1 + expf(-%IN1%));\n";
- case LOG_NZ:
- return " T %TMP% = (%IN1%==0) ? 0 : logf(%IN1%);\n";
-
- default:
- throw new RuntimeException("Invalid unary type: "+this.toString());
}
- }
- else { /* double precision */
- switch( type ) {
- case ROW_SUMS:
- case ROW_SUMSQS:
- case ROW_MINS:
- case ROW_MAXS:
- case ROW_MEANS:
- case ROW_COUNTNNZS: {
- String vectName = StringUtils.capitalize(type.name().substring(4, type.name().length()-1).toLowerCase());
- return sparse ? " T %TMP% = vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, %LEN%);\n":
- " T %TMP% = vect"+vectName+"(%IN1%, static_cast<uint32_t>(%POS1%), %LEN%);\n";
-
- }
- case VECT_EXP:
- case VECT_POW2:
- case VECT_MULT2:
- case VECT_SQRT:
- case VECT_LOG:
- case VECT_ABS:
- case VECT_ROUND:
- case VECT_CEIL:
- case VECT_FLOOR:
- case VECT_SIGN:
- case VECT_SIN:
- case VECT_COS:
- case VECT_TAN:
- case VECT_ASIN:
- case VECT_ACOS:
- case VECT_ATAN:
- case VECT_SINH:
- case VECT_COSH:
- case VECT_TANH:
- case VECT_CUMSUM:
- case VECT_CUMMIN:
- case VECT_CUMMAX:
- case VECT_SPROP:
- case VECT_SIGMOID: {
- String vectName = type.getVectorPrimitiveName();
- return sparse ? " Vector<T>& %TMP% = vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, %LEN%, this);\n" :
- " Vector<T>& %TMP% = vect"+vectName+"Write(%IN1%, static_cast<uint32_t>(%POS1%), %LEN%, this);\n";
- }
+ case VECT_EXP:
+ case VECT_POW2:
+ case VECT_MULT2:
+ case VECT_SQRT:
+ case VECT_LOG:
+ case VECT_ABS:
+ case VECT_ROUND:
+ case VECT_CEIL:
+ case VECT_FLOOR:
+ case VECT_SIGN:
+ case VECT_SIN:
+ case VECT_COS:
+ case VECT_TAN:
+ case VECT_ASIN:
+ case VECT_ACOS:
+ case VECT_ATAN:
+ case VECT_SINH:
+ case VECT_COSH:
+ case VECT_TANH:
+ case VECT_CUMSUM:
+ case VECT_CUMMIN:
+ case VECT_CUMMAX:
+ case VECT_SPROP:
+ case VECT_SIGMOID: {
+ String vectName = type.getVectorPrimitiveName();
+ return sparse ? " Vector<T>& %TMP% = vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, alen, %LEN%, this);\n" :
+ " Vector<T>& %TMP% = vect"+vectName+"Write(%IN1%, static_cast<uint32_t>(%POS1%), %LEN%, this);\n";
+ }
- case EXP:
+ case EXP:
+ if(isSinglePrecision())
+ return " T %TMP% = expf(%IN1%);\n";
+ else
return " T %TMP% = exp(%IN1%);\n";
- case LOOKUP_R:
- return sparse ?
- " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
- " T %TMP% = %IN1%.val(rix);\n";
-// " T %TMP% = getValue(%IN1%, rix);\n";
- case LOOKUP_C:
- return " T %TMP% = getValue(%IN1%, n, 0, cix);\n";
- case LOOKUP_RC:
- return " T %TMP% = getValue(%IN1%, n, rix, cix);\n";
- case LOOKUP0:
- return " T %TMP% = %IN1%[0];\n";
- case POW2:
- return " T %TMP% = %IN1% * %IN1%;\n";
- case MULT2:
- return " T %TMP% = %IN1% + %IN1%;\n";
- case ABS:
+ case LOOKUP_R:
+ return sparse ?
+ "\t\tT %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" :
+// " T %TMP% = %IN1%.val(rix);\n";
+ "\t\tT %TMP% = getValue(%IN1%, rix);\n";
+ case LOOKUP_C:
+ return "\t\tT %TMP% = getValue(%IN1%, n, 0, cix);\n";
+ case LOOKUP_RC:
+ return "\t\tT %TMP% = getValue(%IN1%, n, rix, cix);\n";
+ case LOOKUP0:
+ return "\t\tT %TMP% = %IN1%[0];\n";
+ case POW2:
+ return " T %TMP% = %IN1% * %IN1%;\n";
+ case MULT2:
+ return " T %TMP% = %IN1% + %IN1%;\n";
+ case ABS:
+ if(isSinglePrecision())
+ return " T %TMP% = fabsf(%IN1%);\n";
+ else
return "\t\tT %TMP% = fabs(%IN1%);\n";
- case SIN:
+ case SIN:
+ if(isSinglePrecision())
+ return " T %TMP% = sinf(%IN1%);\n";
+ else
return " T %TMP% = sin(%IN1%);\n";
- case COS:
+ case COS:
+ if(isSinglePrecision())
+ return " T %TMP% = cosf(%IN1%);\n";
+ else
return " T %TMP% = cos(%IN1%);\n";
- case TAN:
+ case TAN:
+ if(isSinglePrecision())
+ return " T %TMP% = tanf(%IN1%);\n";
+ else
return " T %TMP% = tan(%IN1%);\n";
- case ASIN:
+ case ASIN:
+ if(isSinglePrecision())
+ return " T %TMP% = asinf(%IN1%);\n";
+ else
return " T %TMP% = asin(%IN1%);\n";
- case ACOS:
+ case ACOS:
+ if(isSinglePrecision())
+ return " T %TMP% = acosf(%IN1%);\n";
+ else
return " T %TMP% = acos(%IN1%);\n";
- case ATAN:
+ case ATAN:
+ if(isSinglePrecision())
+ return " T %TMP% = atanf(%IN1%);\n";
+ else
return " T %TMP% = atan(%IN1%);\n";
- case SINH:
+ case SINH:
+ if(isSinglePrecision())
+ return " T %TMP% = sinhf(%IN1%);\n";
+ else
return " T %TMP% = sinh(%IN1%);\n";
- case COSH:
+ case COSH:
+ if(isSinglePrecision())
+ return " T %TMP% = coshf(%IN1%);\n";
+ else
return " T %TMP% = cosh(%IN1%);\n";
- case TANH:
+ case TANH:
+ if(isSinglePrecision())
+ return " T %TMP% = tanhf(%IN1%);\n";
+ else
return " T %TMP% = tanh(%IN1%);\n";
- case SIGN:
- return " T %TMP% = signbit(%IN1%) == 0 ? 1.0 : -1.0;\n";
- case SQRT:
+ case SIGN:
+ return " T %TMP% = signbit(%IN1%) == 0 ? 1.0 : -1.0;\n";
+ case SQRT:
+ if(isSinglePrecision())
+ return " T %TMP% = sqrtf(%IN1%);\n";
+ else
return " T %TMP% = sqrt(%IN1%);\n";
- case LOG:
+ case LOG:
+
+ if(isSinglePrecision())
+ return " T %TMP% = logf(%IN1%);\n";
+ else
return " T %TMP% = log(%IN1%);\n";
- case ROUND:
+ case ROUND:
+ if(isSinglePrecision())
+ return " T %TMP% = roundf(%IN1%);\n";
+ else
return "\t\tT %TMP% = round(%IN1%);\n";
- case CEIL:
+ case CEIL:
+ if(isSinglePrecision())
+ return " T %TMP% = ceilf(%IN1%);\n";
+ else
return " T %TMP% = ceil(%IN1%);\n";
- case FLOOR:
+ case FLOOR:
+ if(isSinglePrecision())
+ return " T %TMP% = floorf(%IN1%);\n";
+ else
return " T %TMP% = floor(%IN1%);\n";
- case SPROP:
- return " T %TMP% = %IN1% * (1 - %IN1%);\n";
- case SIGMOID:
+ case SPROP:
+ return " T %TMP% = %IN1% * (1 - %IN1%);\n";
+ case SIGMOID:
+ if(isSinglePrecision())
+ return " T %TMP% = 1 / (1 + expf(-%IN1%));\n";
+ else
return " T %TMP% = 1 / (1 + exp(-%IN1%));\n";
- case LOG_NZ:
+ case LOG_NZ:
+ if(isSinglePrecision())
+ return " T %TMP% = (%IN1%==0) ? 0 : logf(%IN1%);\n";
+ else
return " T %TMP% = (%IN1%==0) ? 0 : log(%IN1%);\n";
- default:
- throw new RuntimeException("Invalid unary type: "+this.toString());
- }
-
+ default:
+ throw new RuntimeException("Invalid unary type: "+this.toString());
}
}
}
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
index 03c35da540..cfe5780326 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -116,9 +116,9 @@ public class SpoofCUDACellwise extends SpoofCellwise implements SpoofCUDAOperato
}
public int execute_dp(long ctx) { return execute_d(ctx); }
- public int execute_sp(long ctx) { return execute_d(ctx); }
+ public int execute_sp(long ctx) { return execute_f(ctx); }
public long getContext() { return ctx; }
public static native int execute_d(long ctx);
- public static native int execute_s(long ctx);
+ public static native int execute_f(long ctx);
}
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
index 47826a9461..0adf2ec605 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -96,9 +96,9 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
int ci, int alen, int n, long grix, int rix) { }
public int execute_dp(long ctx) { return execute_d(ctx); }
- public int execute_sp(long ctx) { return execute_d(ctx); }
+ public int execute_sp(long ctx) { return execute_f(ctx); }
public long getContext() { return ctx; }
public static native int execute_d(long ctx);
- public static native int execute_s(long ctx);
+ public static native int execute_f(long ctx);
}
[systemds] 02/02: [SYSTEMDS-3352] CUDA code generation binaries
Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 29bf8f18ad4893bd22015ab4f5e46b6f8b7c218c
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Wed Apr 20 14:12:41 2022 +0200
[SYSTEMDS-3352] CUDA code generation binaries
Code gen native support compiled on Ubuntu 20 LTS (still on CUDA 10.2 ofc)
---
.../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 302880 -> 285976 bytes
src/main/cuda/kernels/reduction.ptx | 1185 ++++++++++++--------
2 files changed, 698 insertions(+), 487 deletions(-)
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index ec5be11087..81d1184b18 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
index 72b922596a..8b949f9dba 100644
--- a/src/main/cuda/kernels/reduction.ptx
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -11,7 +11,14 @@
.address_size 64
// .globl double2float_f
+.extern .func (.param .b32 func_retval0) vprintf
+(
+ .param .b64 vprintf_param_0,
+ .param .b64 vprintf_param_1
+)
+;
.extern .shared .align 1 .b8 memory[];
+.global .align 1 .b8 $str[28] = {84, 66, 73, 58, 32, 118, 97, 108, 95, 115, 112, 97, 114, 115, 101, 95, 114, 99, 40, 37, 100, 44, 32, 37, 100, 41, 10, 0};
.visible .entry double2float_f(
.param .u64 double2float_f_param_0,
@@ -95,151 +102,151 @@ BB1_2:
.param .u32 reduce_sum_f_param_2
)
{
+ .local .align 8 .b8 __local_depot2[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<25>;
.reg .f32 %f<69>;
- .reg .b32 %r<57>;
- .reg .b64 %rd<36>;
-
-
- ld.param.u64 %rd9, [reduce_sum_f_param_0];
- ld.param.u64 %rd10, [reduce_sum_f_param_1];
- ld.param.u32 %r13, [reduce_sum_f_param_2];
- mov.u32 %r14, %ctaid.x;
- shl.b32 %r15, %r14, 1;
- mov.u32 %r16, %ntid.x;
+ .reg .b32 %r<51>;
+ .reg .b64 %rd<38>;
+
+
+ mov.u64 %SPL, __local_depot2;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd10, [reduce_sum_f_param_0];
+ ld.param.u64 %rd11, [reduce_sum_f_param_1];
+ ld.param.u32 %r14, [reduce_sum_f_param_2];
+ mov.u32 %r15, %ctaid.x;
+ shl.b32 %r16, %r15, 1;
+ mov.u32 %r1, %ntid.x;
mov.u32 %r17, %tid.x;
- mad.lo.s32 %r56, %r15, %r16, %r17;
+ mad.lo.s32 %r48, %r16, %r1, %r17;
mov.f32 %f51, 0f00000000;
- setp.ge.u32 %p1, %r56, %r13;
+ setp.ge.u32 %p1, %r48, %r14;
@%p1 bra BB2_11;
- cvta.to.global.u64 %rd11, %rd9;
- ld.global.u64 %rd1, [%rd11+16];
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
setp.eq.s64 %p2, %rd1, 0;
- ld.global.u64 %rd12, [%rd11+32];
- cvta.to.global.u64 %rd2, %rd12;
+ ld.global.u64 %rd2, [%rd12+32];
+ mov.u32 %r18, %nctaid.x;
+ mul.lo.s32 %r19, %r1, %r18;
+ shl.b32 %r4, %r19, 1;
mov.f32 %f51, 0f00000000;
@%p2 bra BB2_8;
- mad.lo.s32 %r54, %r15, %r16, %r17;
- mov.f32 %f51, 0f00000000;
- mov.u64 %rd32, %rd1;
+ mov.u64 %rd34, %rd1;
BB2_3:
- cvta.to.global.u64 %rd13, %rd32;
- mul.wide.u32 %rd14, %r54, 4;
- add.s64 %rd15, %rd13, %rd14;
- ld.global.u32 %r27, [%rd15];
- mul.wide.u32 %rd16, %r27, 4;
- add.s64 %rd17, %rd2, %rd16;
- ld.global.f32 %f36, [%rd17];
+ mul.wide.u32 %rd13, %r48, 4;
+ add.s64 %rd14, %rd34, %rd13;
+ ld.u32 %r20, [%rd14];
+ mul.wide.u32 %rd15, %r20, 4;
+ add.s64 %rd16, %rd2, %rd15;
+ ld.f32 %f36, [%rd16];
add.f32 %f51, %f51, %f36;
- add.s32 %r55, %r54, %r16;
- setp.ge.u32 %p3, %r55, %r13;
+ add.s32 %r49, %r48, %r1;
+ setp.ge.u32 %p3, %r49, %r14;
@%p3 bra BB2_7;
- setp.eq.s64 %p4, %rd32, 0;
- mov.u64 %rd32, 0;
+ setp.eq.s64 %p4, %rd34, 0;
+ mov.u64 %rd34, 0;
@%p4 bra BB2_6;
- cvta.to.global.u64 %rd19, %rd1;
- mul.wide.u32 %rd20, %r55, 4;
- add.s64 %rd21, %rd19, %rd20;
- ld.global.u32 %r55, [%rd21];
- mov.u64 %rd32, %rd1;
+ mul.wide.u32 %rd18, %r49, 4;
+ add.s64 %rd19, %rd1, %rd18;
+ ld.u32 %r49, [%rd19];
+ mov.u64 %rd34, %rd1;
BB2_6:
- mul.wide.u32 %rd22, %r55, 4;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.f32 %f37, [%rd23];
+ mul.wide.u32 %rd20, %r49, 4;
+ add.s64 %rd21, %rd2, %rd20;
+ ld.f32 %f37, [%rd21];
add.f32 %f51, %f51, %f37;
BB2_7:
- shl.b32 %r30, %r16, 1;
- mov.u32 %r31, %nctaid.x;
- mad.lo.s32 %r54, %r30, %r31, %r54;
- setp.lt.u32 %p5, %r54, %r13;
+ shl.b32 %r23, %r1, 1;
+ mad.lo.s32 %r48, %r23, %r18, %r48;
+ setp.lt.u32 %p5, %r48, %r14;
@%p5 bra BB2_3;
bra.uni BB2_11;
BB2_8:
- mul.wide.u32 %rd24, %r56, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f38, [%rd25];
+ mul.wide.u32 %rd22, %r48, 4;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.f32 %f38, [%rd23];
add.f32 %f51, %f51, %f38;
- add.s32 %r10, %r56, %r16;
- setp.ge.u32 %p6, %r10, %r13;
+ add.s32 %r11, %r48, %r1;
+ setp.ge.u32 %p6, %r11, %r14;
@%p6 bra BB2_10;
- mul.wide.u32 %rd26, %r10, 4;
- add.s64 %rd27, %rd2, %rd26;
- ld.global.f32 %f39, [%rd27];
+ mul.wide.u32 %rd24, %r11, 4;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.f32 %f39, [%rd25];
add.f32 %f51, %f51, %f39;
BB2_10:
- mov.u32 %r32, %nctaid.x;
- shl.b32 %r33, %r16, 1;
- mad.lo.s32 %r56, %r33, %r32, %r56;
- setp.lt.u32 %p7, %r56, %r13;
+ add.s32 %r48, %r48, %r4;
+ setp.lt.u32 %p7, %r48, %r14;
@%p7 bra BB2_8;
BB2_11:
- shl.b32 %r35, %r17, 2;
- mov.u32 %r36, memory;
- add.s32 %r12, %r36, %r35;
- st.shared.f32 [%r12], %f51;
+ shl.b32 %r26, %r17, 2;
+ mov.u32 %r27, memory;
+ add.s32 %r13, %r27, %r26;
+ st.shared.f32 [%r13], %f51;
bar.sync 0;
- setp.lt.u32 %p8, %r16, 1024;
+ setp.lt.u32 %p8, %r1, 1024;
@%p8 bra BB2_15;
setp.gt.u32 %p9, %r17, 511;
@%p9 bra BB2_14;
- ld.shared.f32 %f40, [%r12+2048];
+ ld.shared.f32 %f40, [%r13+2048];
add.f32 %f51, %f51, %f40;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB2_14:
bar.sync 0;
BB2_15:
- setp.lt.u32 %p10, %r16, 512;
+ setp.lt.u32 %p10, %r1, 512;
@%p10 bra BB2_19;
setp.gt.u32 %p11, %r17, 255;
@%p11 bra BB2_18;
- ld.shared.f32 %f41, [%r12+1024];
+ ld.shared.f32 %f41, [%r13+1024];
add.f32 %f51, %f51, %f41;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB2_18:
bar.sync 0;
BB2_19:
- setp.lt.u32 %p12, %r16, 256;
+ setp.lt.u32 %p12, %r1, 256;
@%p12 bra BB2_23;
setp.gt.u32 %p13, %r17, 127;
@%p13 bra BB2_22;
- ld.shared.f32 %f42, [%r12+512];
+ ld.shared.f32 %f42, [%r13+512];
add.f32 %f51, %f51, %f42;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB2_22:
bar.sync 0;
BB2_23:
- setp.lt.u32 %p14, %r16, 128;
+ setp.lt.u32 %p14, %r1, 128;
@%p14 bra BB2_27;
setp.gt.u32 %p15, %r17, 63;
@%p15 bra BB2_26;
- ld.shared.f32 %f43, [%r12+256];
+ ld.shared.f32 %f43, [%r13+256];
add.f32 %f51, %f51, %f43;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB2_26:
bar.sync 0;
@@ -248,72 +255,105 @@ BB2_27:
setp.gt.u32 %p16, %r17, 31;
@%p16 bra BB2_40;
- setp.lt.u32 %p17, %r16, 64;
+ setp.lt.u32 %p17, %r1, 64;
@%p17 bra BB2_30;
- ld.volatile.shared.f32 %f44, [%r12+128];
+ ld.volatile.shared.f32 %f44, [%r13+128];
add.f32 %f51, %f51, %f44;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB2_30:
- setp.lt.u32 %p18, %r16, 32;
+ setp.lt.u32 %p18, %r1, 32;
@%p18 bra BB2_32;
- ld.volatile.shared.f32 %f45, [%r12+64];
+ ld.volatile.shared.f32 %f45, [%r13+64];
add.f32 %f51, %f51, %f45;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB2_32:
- setp.lt.u32 %p19, %r16, 16;
+ setp.lt.u32 %p19, %r1, 16;
@%p19 bra BB2_34;
- ld.volatile.shared.f32 %f46, [%r12+32];
+ ld.volatile.shared.f32 %f46, [%r13+32];
add.f32 %f51, %f51, %f46;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB2_34:
- setp.lt.u32 %p20, %r16, 8;
+ setp.lt.u32 %p20, %r1, 8;
@%p20 bra BB2_36;
- ld.volatile.shared.f32 %f47, [%r12+16];
+ ld.volatile.shared.f32 %f47, [%r13+16];
add.f32 %f51, %f51, %f47;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB2_36:
- setp.lt.u32 %p21, %r16, 4;
+ setp.lt.u32 %p21, %r1, 4;
@%p21 bra BB2_38;
- ld.volatile.shared.f32 %f48, [%r12+8];
+ ld.volatile.shared.f32 %f48, [%r13+8];
add.f32 %f51, %f51, %f48;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB2_38:
- setp.lt.u32 %p22, %r16, 2;
+ setp.lt.u32 %p22, %r1, 2;
@%p22 bra BB2_40;
- ld.volatile.shared.f32 %f49, [%r12+4];
+ ld.volatile.shared.f32 %f49, [%r13+4];
add.f32 %f50, %f51, %f49;
- st.volatile.shared.f32 [%r12], %f50;
+ st.volatile.shared.f32 [%r13], %f50;
BB2_40:
setp.ne.s32 %p23, %r17, 0;
- @%p23 bra BB2_44;
+ @%p23 bra BB2_45;
ld.shared.f32 %f32, [memory];
- cvta.to.global.u64 %rd28, %rd10;
- ld.global.u64 %rd29, [%rd28+16];
- ld.global.u64 %rd30, [%rd28+32];
- cvta.to.global.u64 %rd35, %rd30;
- setp.ne.s64 %p24, %rd29, 0;
+ cvta.to.global.u64 %rd26, %rd11;
+ add.s64 %rd6, %rd26, 16;
+ ld.global.u64 %rd27, [%rd26+16];
+ setp.eq.s64 %p24, %rd27, 0;
@%p24 bra BB2_43;
- mul.wide.u32 %rd31, %r14, 4;
- add.s64 %rd35, %rd35, %rd31;
+ mov.u32 %r44, 0;
+ add.u64 %rd28, %SP, 0;
+ add.u64 %rd29, %SPL, 0;
+ st.local.u32 [%rd29], %r44;
+ st.local.u32 [%rd29+4], %r15;
+ mov.u64 %rd30, $str;
+ cvta.global.u64 %rd31, %rd30;
+ // Callseq Start 0
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd31;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd28;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r46, [retval0+0];
+
+ //{
+ }// Callseq End 0
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd37, [%rd6+16];
+ bra.uni BB2_44;
BB2_43:
- st.global.f32 [%rd35], %f32;
+ ld.global.u64 %rd32, [%rd6+16];
+ mul.wide.u32 %rd33, %r15, 4;
+ add.s64 %rd37, %rd32, %rd33;
BB2_44:
+ st.f32 [%rd37], %f32;
+
+BB2_45:
ret;
}
@@ -324,151 +364,151 @@ BB2_44:
.param .u32 reduce_sum_d_param_2
)
{
+ .local .align 8 .b8 __local_depot3[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<25>;
- .reg .b32 %r<57>;
+ .reg .b32 %r<51>;
.reg .f64 %fd<69>;
- .reg .b64 %rd<36>;
+ .reg .b64 %rd<38>;
- ld.param.u64 %rd9, [reduce_sum_d_param_0];
- ld.param.u64 %rd10, [reduce_sum_d_param_1];
- ld.param.u32 %r13, [reduce_sum_d_param_2];
- mov.u32 %r14, %ctaid.x;
- shl.b32 %r15, %r14, 1;
- mov.u32 %r16, %ntid.x;
+ mov.u64 %SPL, __local_depot3;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd10, [reduce_sum_d_param_0];
+ ld.param.u64 %rd11, [reduce_sum_d_param_1];
+ ld.param.u32 %r14, [reduce_sum_d_param_2];
+ mov.u32 %r15, %ctaid.x;
+ shl.b32 %r16, %r15, 1;
+ mov.u32 %r1, %ntid.x;
mov.u32 %r17, %tid.x;
- mad.lo.s32 %r56, %r15, %r16, %r17;
+ mad.lo.s32 %r48, %r16, %r1, %r17;
mov.f64 %fd51, 0d0000000000000000;
- setp.ge.u32 %p1, %r56, %r13;
+ setp.ge.u32 %p1, %r48, %r14;
@%p1 bra BB3_11;
- cvta.to.global.u64 %rd11, %rd9;
- ld.global.u64 %rd1, [%rd11+16];
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
setp.eq.s64 %p2, %rd1, 0;
- ld.global.u64 %rd12, [%rd11+32];
- cvta.to.global.u64 %rd2, %rd12;
+ ld.global.u64 %rd2, [%rd12+32];
+ mov.u32 %r18, %nctaid.x;
+ mul.lo.s32 %r19, %r1, %r18;
+ shl.b32 %r4, %r19, 1;
mov.f64 %fd51, 0d0000000000000000;
@%p2 bra BB3_8;
- mad.lo.s32 %r54, %r15, %r16, %r17;
- mov.f64 %fd51, 0d0000000000000000;
- mov.u64 %rd32, %rd1;
+ mov.u64 %rd34, %rd1;
BB3_3:
- cvta.to.global.u64 %rd13, %rd32;
- mul.wide.u32 %rd14, %r54, 4;
- add.s64 %rd15, %rd13, %rd14;
- ld.global.u32 %r27, [%rd15];
- mul.wide.u32 %rd16, %r27, 8;
- add.s64 %rd17, %rd2, %rd16;
- ld.global.f64 %fd36, [%rd17];
+ mul.wide.u32 %rd13, %r48, 4;
+ add.s64 %rd14, %rd34, %rd13;
+ ld.u32 %r20, [%rd14];
+ mul.wide.u32 %rd15, %r20, 8;
+ add.s64 %rd16, %rd2, %rd15;
+ ld.f64 %fd36, [%rd16];
add.f64 %fd51, %fd51, %fd36;
- add.s32 %r55, %r54, %r16;
- setp.ge.u32 %p3, %r55, %r13;
+ add.s32 %r49, %r48, %r1;
+ setp.ge.u32 %p3, %r49, %r14;
@%p3 bra BB3_7;
- setp.eq.s64 %p4, %rd32, 0;
- mov.u64 %rd32, 0;
+ setp.eq.s64 %p4, %rd34, 0;
+ mov.u64 %rd34, 0;
@%p4 bra BB3_6;
- cvta.to.global.u64 %rd19, %rd1;
- mul.wide.u32 %rd20, %r55, 4;
- add.s64 %rd21, %rd19, %rd20;
- ld.global.u32 %r55, [%rd21];
- mov.u64 %rd32, %rd1;
+ mul.wide.u32 %rd18, %r49, 4;
+ add.s64 %rd19, %rd1, %rd18;
+ ld.u32 %r49, [%rd19];
+ mov.u64 %rd34, %rd1;
BB3_6:
- mul.wide.u32 %rd22, %r55, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.f64 %fd37, [%rd23];
+ mul.wide.u32 %rd20, %r49, 8;
+ add.s64 %rd21, %rd2, %rd20;
+ ld.f64 %fd37, [%rd21];
add.f64 %fd51, %fd51, %fd37;
BB3_7:
- shl.b32 %r30, %r16, 1;
- mov.u32 %r31, %nctaid.x;
- mad.lo.s32 %r54, %r30, %r31, %r54;
- setp.lt.u32 %p5, %r54, %r13;
+ shl.b32 %r23, %r1, 1;
+ mad.lo.s32 %r48, %r23, %r18, %r48;
+ setp.lt.u32 %p5, %r48, %r14;
@%p5 bra BB3_3;
bra.uni BB3_11;
BB3_8:
- mul.wide.u32 %rd24, %r56, 8;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f64 %fd38, [%rd25];
+ mul.wide.u32 %rd22, %r48, 8;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.f64 %fd38, [%rd23];
add.f64 %fd51, %fd51, %fd38;
- add.s32 %r10, %r56, %r16;
- setp.ge.u32 %p6, %r10, %r13;
+ add.s32 %r11, %r48, %r1;
+ setp.ge.u32 %p6, %r11, %r14;
@%p6 bra BB3_10;
- mul.wide.u32 %rd26, %r10, 8;
- add.s64 %rd27, %rd2, %rd26;
- ld.global.f64 %fd39, [%rd27];
+ mul.wide.u32 %rd24, %r11, 8;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.f64 %fd39, [%rd25];
add.f64 %fd51, %fd51, %fd39;
BB3_10:
- mov.u32 %r32, %nctaid.x;
- shl.b32 %r33, %r16, 1;
- mad.lo.s32 %r56, %r33, %r32, %r56;
- setp.lt.u32 %p7, %r56, %r13;
+ add.s32 %r48, %r48, %r4;
+ setp.lt.u32 %p7, %r48, %r14;
@%p7 bra BB3_8;
BB3_11:
- shl.b32 %r35, %r17, 3;
- mov.u32 %r36, memory;
- add.s32 %r12, %r36, %r35;
- st.shared.f64 [%r12], %fd51;
+ shl.b32 %r26, %r17, 3;
+ mov.u32 %r27, memory;
+ add.s32 %r13, %r27, %r26;
+ st.shared.f64 [%r13], %fd51;
bar.sync 0;
- setp.lt.u32 %p8, %r16, 1024;
+ setp.lt.u32 %p8, %r1, 1024;
@%p8 bra BB3_15;
setp.gt.u32 %p9, %r17, 511;
@%p9 bra BB3_14;
- ld.shared.f64 %fd40, [%r12+4096];
+ ld.shared.f64 %fd40, [%r13+4096];
add.f64 %fd51, %fd51, %fd40;
- st.shared.f64 [%r12], %fd51;
+ st.shared.f64 [%r13], %fd51;
BB3_14:
bar.sync 0;
BB3_15:
- setp.lt.u32 %p10, %r16, 512;
+ setp.lt.u32 %p10, %r1, 512;
@%p10 bra BB3_19;
setp.gt.u32 %p11, %r17, 255;
@%p11 bra BB3_18;
- ld.shared.f64 %fd41, [%r12+2048];
+ ld.shared.f64 %fd41, [%r13+2048];
add.f64 %fd51, %fd51, %fd41;
- st.shared.f64 [%r12], %fd51;
+ st.shared.f64 [%r13], %fd51;
BB3_18:
bar.sync 0;
BB3_19:
- setp.lt.u32 %p12, %r16, 256;
+ setp.lt.u32 %p12, %r1, 256;
@%p12 bra BB3_23;
setp.gt.u32 %p13, %r17, 127;
@%p13 bra BB3_22;
- ld.shared.f64 %fd42, [%r12+1024];
+ ld.shared.f64 %fd42, [%r13+1024];
add.f64 %fd51, %fd51, %fd42;
- st.shared.f64 [%r12], %fd51;
+ st.shared.f64 [%r13], %fd51;
BB3_22:
bar.sync 0;
BB3_23:
- setp.lt.u32 %p14, %r16, 128;
+ setp.lt.u32 %p14, %r1, 128;
@%p14 bra BB3_27;
setp.gt.u32 %p15, %r17, 63;
@%p15 bra BB3_26;
- ld.shared.f64 %fd43, [%r12+512];
+ ld.shared.f64 %fd43, [%r13+512];
add.f64 %fd51, %fd51, %fd43;
- st.shared.f64 [%r12], %fd51;
+ st.shared.f64 [%r13], %fd51;
BB3_26:
bar.sync 0;
@@ -477,72 +517,105 @@ BB3_27:
setp.gt.u32 %p16, %r17, 31;
@%p16 bra BB3_40;
- setp.lt.u32 %p17, %r16, 64;
+ setp.lt.u32 %p17, %r1, 64;
@%p17 bra BB3_30;
- ld.volatile.shared.f64 %fd44, [%r12+256];
+ ld.volatile.shared.f64 %fd44, [%r13+256];
add.f64 %fd51, %fd51, %fd44;
- st.volatile.shared.f64 [%r12], %fd51;
+ st.volatile.shared.f64 [%r13], %fd51;
BB3_30:
- setp.lt.u32 %p18, %r16, 32;
+ setp.lt.u32 %p18, %r1, 32;
@%p18 bra BB3_32;
- ld.volatile.shared.f64 %fd45, [%r12+128];
+ ld.volatile.shared.f64 %fd45, [%r13+128];
add.f64 %fd51, %fd51, %fd45;
- st.volatile.shared.f64 [%r12], %fd51;
+ st.volatile.shared.f64 [%r13], %fd51;
BB3_32:
- setp.lt.u32 %p19, %r16, 16;
+ setp.lt.u32 %p19, %r1, 16;
@%p19 bra BB3_34;
- ld.volatile.shared.f64 %fd46, [%r12+64];
+ ld.volatile.shared.f64 %fd46, [%r13+64];
add.f64 %fd51, %fd51, %fd46;
- st.volatile.shared.f64 [%r12], %fd51;
+ st.volatile.shared.f64 [%r13], %fd51;
BB3_34:
- setp.lt.u32 %p20, %r16, 8;
+ setp.lt.u32 %p20, %r1, 8;
@%p20 bra BB3_36;
- ld.volatile.shared.f64 %fd47, [%r12+32];
+ ld.volatile.shared.f64 %fd47, [%r13+32];
add.f64 %fd51, %fd51, %fd47;
- st.volatile.shared.f64 [%r12], %fd51;
+ st.volatile.shared.f64 [%r13], %fd51;
BB3_36:
- setp.lt.u32 %p21, %r16, 4;
+ setp.lt.u32 %p21, %r1, 4;
@%p21 bra BB3_38;
- ld.volatile.shared.f64 %fd48, [%r12+16];
+ ld.volatile.shared.f64 %fd48, [%r13+16];
add.f64 %fd51, %fd51, %fd48;
- st.volatile.shared.f64 [%r12], %fd51;
+ st.volatile.shared.f64 [%r13], %fd51;
BB3_38:
- setp.lt.u32 %p22, %r16, 2;
+ setp.lt.u32 %p22, %r1, 2;
@%p22 bra BB3_40;
- ld.volatile.shared.f64 %fd49, [%r12+8];
+ ld.volatile.shared.f64 %fd49, [%r13+8];
add.f64 %fd50, %fd51, %fd49;
- st.volatile.shared.f64 [%r12], %fd50;
+ st.volatile.shared.f64 [%r13], %fd50;
BB3_40:
setp.ne.s32 %p23, %r17, 0;
- @%p23 bra BB3_44;
+ @%p23 bra BB3_45;
ld.shared.f64 %fd32, [memory];
- cvta.to.global.u64 %rd28, %rd10;
- ld.global.u64 %rd29, [%rd28+16];
- ld.global.u64 %rd30, [%rd28+32];
- cvta.to.global.u64 %rd35, %rd30;
- setp.ne.s64 %p24, %rd29, 0;
+ cvta.to.global.u64 %rd26, %rd11;
+ add.s64 %rd6, %rd26, 16;
+ ld.global.u64 %rd27, [%rd26+16];
+ setp.eq.s64 %p24, %rd27, 0;
@%p24 bra BB3_43;
- mul.wide.u32 %rd31, %r14, 8;
- add.s64 %rd35, %rd35, %rd31;
+ mov.u32 %r44, 0;
+ add.u64 %rd28, %SP, 0;
+ add.u64 %rd29, %SPL, 0;
+ st.local.u32 [%rd29], %r44;
+ st.local.u32 [%rd29+4], %r15;
+ mov.u64 %rd30, $str;
+ cvta.global.u64 %rd31, %rd30;
+ // Callseq Start 1
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd31;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd28;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r46, [retval0+0];
+
+ //{
+ }// Callseq End 1
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd37, [%rd6+16];
+ bra.uni BB3_44;
BB3_43:
- st.global.f64 [%rd35], %fd32;
+ ld.global.u64 %rd32, [%rd6+16];
+ mul.wide.u32 %rd33, %r15, 8;
+ add.s64 %rd37, %rd32, %rd33;
BB3_44:
+ st.f64 [%rd37], %fd32;
+
+BB3_45:
ret;
}
@@ -553,151 +626,151 @@ BB3_44:
.param .u32 reduce_max_f_param_2
)
{
+ .local .align 8 .b8 __local_depot4[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<25>;
.reg .f32 %f<69>;
- .reg .b32 %r<57>;
- .reg .b64 %rd<36>;
-
-
- ld.param.u64 %rd9, [reduce_max_f_param_0];
- ld.param.u64 %rd10, [reduce_max_f_param_1];
- ld.param.u32 %r13, [reduce_max_f_param_2];
- mov.u32 %r14, %ctaid.x;
- shl.b32 %r15, %r14, 1;
- mov.u32 %r16, %ntid.x;
+ .reg .b32 %r<51>;
+ .reg .b64 %rd<38>;
+
+
+ mov.u64 %SPL, __local_depot4;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd10, [reduce_max_f_param_0];
+ ld.param.u64 %rd11, [reduce_max_f_param_1];
+ ld.param.u32 %r14, [reduce_max_f_param_2];
+ mov.u32 %r15, %ctaid.x;
+ shl.b32 %r16, %r15, 1;
+ mov.u32 %r1, %ntid.x;
mov.u32 %r17, %tid.x;
- mad.lo.s32 %r56, %r15, %r16, %r17;
+ mad.lo.s32 %r48, %r16, %r1, %r17;
mov.f32 %f51, 0fFF800000;
- setp.ge.u32 %p1, %r56, %r13;
+ setp.ge.u32 %p1, %r48, %r14;
@%p1 bra BB4_11;
- cvta.to.global.u64 %rd11, %rd9;
- ld.global.u64 %rd1, [%rd11+16];
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
setp.eq.s64 %p2, %rd1, 0;
- ld.global.u64 %rd12, [%rd11+32];
- cvta.to.global.u64 %rd2, %rd12;
+ ld.global.u64 %rd2, [%rd12+32];
+ mov.u32 %r18, %nctaid.x;
+ mul.lo.s32 %r19, %r1, %r18;
+ shl.b32 %r4, %r19, 1;
mov.f32 %f51, 0fFF800000;
@%p2 bra BB4_8;
- mad.lo.s32 %r54, %r15, %r16, %r17;
- mov.f32 %f51, 0fFF800000;
- mov.u64 %rd32, %rd1;
+ mov.u64 %rd34, %rd1;
BB4_3:
- cvta.to.global.u64 %rd13, %rd32;
- mul.wide.u32 %rd14, %r54, 4;
- add.s64 %rd15, %rd13, %rd14;
- ld.global.u32 %r27, [%rd15];
- mul.wide.u32 %rd16, %r27, 4;
- add.s64 %rd17, %rd2, %rd16;
- ld.global.f32 %f36, [%rd17];
+ mul.wide.u32 %rd13, %r48, 4;
+ add.s64 %rd14, %rd34, %rd13;
+ ld.u32 %r20, [%rd14];
+ mul.wide.u32 %rd15, %r20, 4;
+ add.s64 %rd16, %rd2, %rd15;
+ ld.f32 %f36, [%rd16];
max.f32 %f51, %f51, %f36;
- add.s32 %r55, %r54, %r16;
- setp.ge.u32 %p3, %r55, %r13;
+ add.s32 %r49, %r48, %r1;
+ setp.ge.u32 %p3, %r49, %r14;
@%p3 bra BB4_7;
- setp.eq.s64 %p4, %rd32, 0;
- mov.u64 %rd32, 0;
+ setp.eq.s64 %p4, %rd34, 0;
+ mov.u64 %rd34, 0;
@%p4 bra BB4_6;
- cvta.to.global.u64 %rd19, %rd1;
- mul.wide.u32 %rd20, %r55, 4;
- add.s64 %rd21, %rd19, %rd20;
- ld.global.u32 %r55, [%rd21];
- mov.u64 %rd32, %rd1;
+ mul.wide.u32 %rd18, %r49, 4;
+ add.s64 %rd19, %rd1, %rd18;
+ ld.u32 %r49, [%rd19];
+ mov.u64 %rd34, %rd1;
BB4_6:
- mul.wide.u32 %rd22, %r55, 4;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.f32 %f37, [%rd23];
+ mul.wide.u32 %rd20, %r49, 4;
+ add.s64 %rd21, %rd2, %rd20;
+ ld.f32 %f37, [%rd21];
max.f32 %f51, %f51, %f37;
BB4_7:
- shl.b32 %r30, %r16, 1;
- mov.u32 %r31, %nctaid.x;
- mad.lo.s32 %r54, %r30, %r31, %r54;
- setp.lt.u32 %p5, %r54, %r13;
+ shl.b32 %r23, %r1, 1;
+ mad.lo.s32 %r48, %r23, %r18, %r48;
+ setp.lt.u32 %p5, %r48, %r14;
@%p5 bra BB4_3;
bra.uni BB4_11;
BB4_8:
- mul.wide.u32 %rd24, %r56, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f38, [%rd25];
+ mul.wide.u32 %rd22, %r48, 4;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.f32 %f38, [%rd23];
max.f32 %f51, %f51, %f38;
- add.s32 %r10, %r56, %r16;
- setp.ge.u32 %p6, %r10, %r13;
+ add.s32 %r11, %r48, %r1;
+ setp.ge.u32 %p6, %r11, %r14;
@%p6 bra BB4_10;
- mul.wide.u32 %rd26, %r10, 4;
- add.s64 %rd27, %rd2, %rd26;
- ld.global.f32 %f39, [%rd27];
+ mul.wide.u32 %rd24, %r11, 4;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.f32 %f39, [%rd25];
max.f32 %f51, %f51, %f39;
BB4_10:
- mov.u32 %r32, %nctaid.x;
- shl.b32 %r33, %r16, 1;
- mad.lo.s32 %r56, %r33, %r32, %r56;
- setp.lt.u32 %p7, %r56, %r13;
+ add.s32 %r48, %r48, %r4;
+ setp.lt.u32 %p7, %r48, %r14;
@%p7 bra BB4_8;
BB4_11:
- shl.b32 %r35, %r17, 2;
- mov.u32 %r36, memory;
- add.s32 %r12, %r36, %r35;
- st.shared.f32 [%r12], %f51;
+ shl.b32 %r26, %r17, 2;
+ mov.u32 %r27, memory;
+ add.s32 %r13, %r27, %r26;
+ st.shared.f32 [%r13], %f51;
bar.sync 0;
- setp.lt.u32 %p8, %r16, 1024;
+ setp.lt.u32 %p8, %r1, 1024;
@%p8 bra BB4_15;
setp.gt.u32 %p9, %r17, 511;
@%p9 bra BB4_14;
- ld.shared.f32 %f40, [%r12+2048];
+ ld.shared.f32 %f40, [%r13+2048];
max.f32 %f51, %f51, %f40;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB4_14:
bar.sync 0;
BB4_15:
- setp.lt.u32 %p10, %r16, 512;
+ setp.lt.u32 %p10, %r1, 512;
@%p10 bra BB4_19;
setp.gt.u32 %p11, %r17, 255;
@%p11 bra BB4_18;
- ld.shared.f32 %f41, [%r12+1024];
+ ld.shared.f32 %f41, [%r13+1024];
max.f32 %f51, %f51, %f41;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB4_18:
bar.sync 0;
BB4_19:
- setp.lt.u32 %p12, %r16, 256;
+ setp.lt.u32 %p12, %r1, 256;
@%p12 bra BB4_23;
setp.gt.u32 %p13, %r17, 127;
@%p13 bra BB4_22;
- ld.shared.f32 %f42, [%r12+512];
+ ld.shared.f32 %f42, [%r13+512];
max.f32 %f51, %f51, %f42;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB4_22:
bar.sync 0;
BB4_23:
- setp.lt.u32 %p14, %r16, 128;
+ setp.lt.u32 %p14, %r1, 128;
@%p14 bra BB4_27;
setp.gt.u32 %p15, %r17, 63;
@%p15 bra BB4_26;
- ld.shared.f32 %f43, [%r12+256];
+ ld.shared.f32 %f43, [%r13+256];
max.f32 %f51, %f51, %f43;
- st.shared.f32 [%r12], %f51;
+ st.shared.f32 [%r13], %f51;
BB4_26:
bar.sync 0;
@@ -706,72 +779,105 @@ BB4_27:
setp.gt.u32 %p16, %r17, 31;
@%p16 bra BB4_40;
- setp.lt.u32 %p17, %r16, 64;
+ setp.lt.u32 %p17, %r1, 64;
@%p17 bra BB4_30;
- ld.volatile.shared.f32 %f44, [%r12+128];
+ ld.volatile.shared.f32 %f44, [%r13+128];
max.f32 %f51, %f51, %f44;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB4_30:
- setp.lt.u32 %p18, %r16, 32;
+ setp.lt.u32 %p18, %r1, 32;
@%p18 bra BB4_32;
- ld.volatile.shared.f32 %f45, [%r12+64];
+ ld.volatile.shared.f32 %f45, [%r13+64];
max.f32 %f51, %f51, %f45;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB4_32:
- setp.lt.u32 %p19, %r16, 16;
+ setp.lt.u32 %p19, %r1, 16;
@%p19 bra BB4_34;
- ld.volatile.shared.f32 %f46, [%r12+32];
+ ld.volatile.shared.f32 %f46, [%r13+32];
max.f32 %f51, %f51, %f46;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB4_34:
- setp.lt.u32 %p20, %r16, 8;
+ setp.lt.u32 %p20, %r1, 8;
@%p20 bra BB4_36;
- ld.volatile.shared.f32 %f47, [%r12+16];
+ ld.volatile.shared.f32 %f47, [%r13+16];
max.f32 %f51, %f51, %f47;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB4_36:
- setp.lt.u32 %p21, %r16, 4;
+ setp.lt.u32 %p21, %r1, 4;
@%p21 bra BB4_38;
- ld.volatile.shared.f32 %f48, [%r12+8];
+ ld.volatile.shared.f32 %f48, [%r13+8];
max.f32 %f51, %f51, %f48;
- st.volatile.shared.f32 [%r12], %f51;
+ st.volatile.shared.f32 [%r13], %f51;
BB4_38:
- setp.lt.u32 %p22, %r16, 2;
+ setp.lt.u32 %p22, %r1, 2;
@%p22 bra BB4_40;
- ld.volatile.shared.f32 %f49, [%r12+4];
+ ld.volatile.shared.f32 %f49, [%r13+4];
max.f32 %f50, %f51, %f49;
- st.volatile.shared.f32 [%r12], %f50;
+ st.volatile.shared.f32 [%r13], %f50;
BB4_40:
setp.ne.s32 %p23, %r17, 0;
- @%p23 bra BB4_44;
+ @%p23 bra BB4_45;
ld.shared.f32 %f32, [memory];
- cvta.to.global.u64 %rd28, %rd10;
- ld.global.u64 %rd29, [%rd28+16];
- ld.global.u64 %rd30, [%rd28+32];
- cvta.to.global.u64 %rd35, %rd30;
- setp.ne.s64 %p24, %rd29, 0;
+ cvta.to.global.u64 %rd26, %rd11;
+ add.s64 %rd6, %rd26, 16;
+ ld.global.u64 %rd27, [%rd26+16];
+ setp.eq.s64 %p24, %rd27, 0;
@%p24 bra BB4_43;
- mul.wide.u32 %rd31, %r14, 4;
- add.s64 %rd35, %rd35, %rd31;
+ mov.u32 %r44, 0;
+ add.u64 %rd28, %SP, 0;
+ add.u64 %rd29, %SPL, 0;
+ st.local.u32 [%rd29], %r44;
+ st.local.u32 [%rd29+4], %r15;
+ mov.u64 %rd30, $str;
+ cvta.global.u64 %rd31, %rd30;
+ // Callseq Start 2
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd31;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd28;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r46, [retval0+0];
+
+ //{
+ }// Callseq End 2
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd37, [%rd6+16];
+ bra.uni BB4_44;
BB4_43:
- st.global.f32 [%rd35], %f32;
+ ld.global.u64 %rd32, [%rd6+16];
+ mul.wide.u32 %rd33, %r15, 4;
+ add.s64 %rd37, %rd32, %rd33;
BB4_44:
+ st.f32 [%rd37], %f32;
+
+BB4_45:
ret;
}
@@ -782,85 +888,87 @@ BB4_44:
.param .u32 reduce_max_d_param_2
)
{
+ .local .align 8 .b8 __local_depot5[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<23>;
- .reg .b32 %r<46>;
+ .reg .b32 %r<49>;
.reg .f64 %fd<60>;
- .reg .b64 %rd<34>;
+ .reg .b64 %rd<36>;
- ld.param.u64 %rd10, [reduce_max_d_param_0];
- ld.param.u64 %rd11, [reduce_max_d_param_1];
+ mov.u64 %SPL, __local_depot5;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd11, [reduce_max_d_param_0];
+ ld.param.u64 %rd12, [reduce_max_d_param_1];
ld.param.u32 %r10, [reduce_max_d_param_2];
- mov.u32 %r11, %tid.x;
- mov.u32 %r12, %ctaid.x;
- shl.b32 %r13, %r12, 1;
- mov.u32 %r14, %ntid.x;
- mad.lo.s32 %r43, %r13, %r14, %r11;
+ mov.u32 %r11, %ctaid.x;
+ shl.b32 %r12, %r11, 1;
+ mov.u32 %r13, %ntid.x;
+ mov.u32 %r14, %tid.x;
+ mad.lo.s32 %r46, %r12, %r13, %r14;
mov.f64 %fd44, 0dFFF0000000000000;
- setp.ge.u32 %p1, %r43, %r10;
+ setp.ge.u32 %p1, %r46, %r10;
@%p1 bra BB5_9;
- cvta.to.global.u64 %rd12, %rd10;
- ld.global.u64 %rd1, [%rd12+16];
- ld.global.u64 %rd13, [%rd12+32];
- cvta.to.global.u64 %rd2, %rd13;
+ cvta.to.global.u64 %rd13, %rd11;
+ ld.global.u64 %rd1, [%rd13+16];
+ ld.global.u64 %rd2, [%rd13+32];
mov.f64 %fd44, 0dFFF0000000000000;
- mov.u64 %rd30, %rd1;
+ mov.u64 %rd32, %rd1;
BB5_2:
setp.eq.s64 %p2, %rd1, 0;
- mov.u32 %r44, %r43;
+ mov.u32 %r47, %r46;
@%p2 bra BB5_4;
- cvta.to.global.u64 %rd14, %rd1;
- mul.wide.u32 %rd15, %r43, 4;
- add.s64 %rd16, %rd14, %rd15;
- ld.global.u32 %r44, [%rd16];
- mov.u64 %rd30, %rd1;
+ mul.wide.u32 %rd14, %r46, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ ld.u32 %r47, [%rd15];
+ mov.u64 %rd32, %rd1;
BB5_4:
- mul.wide.u32 %rd17, %r44, 8;
- add.s64 %rd18, %rd2, %rd17;
- ld.global.f64 %fd31, [%rd18];
+ mul.wide.u32 %rd16, %r47, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.f64 %fd31, [%rd17];
max.f64 %fd44, %fd44, %fd31;
- add.s32 %r45, %r43, %r14;
- setp.ge.u32 %p3, %r45, %r10;
+ add.s32 %r48, %r46, %r13;
+ setp.ge.u32 %p3, %r48, %r10;
@%p3 bra BB5_8;
- setp.eq.s64 %p4, %rd30, 0;
- mov.u64 %rd30, 0;
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
@%p4 bra BB5_7;
- cvta.to.global.u64 %rd20, %rd1;
- add.s32 %r19, %r43, %r14;
- mul.wide.u32 %rd21, %r19, 4;
- add.s64 %rd22, %rd20, %rd21;
- ld.global.u32 %r45, [%rd22];
- mov.u64 %rd30, %rd1;
+ add.s32 %r19, %r46, %r13;
+ mul.wide.u32 %rd19, %r19, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ ld.u32 %r48, [%rd20];
+ mov.u64 %rd32, %rd1;
BB5_7:
- mul.wide.u32 %rd23, %r45, 8;
- add.s64 %rd24, %rd2, %rd23;
- ld.global.f64 %fd32, [%rd24];
+ mul.wide.u32 %rd21, %r48, 8;
+ add.s64 %rd22, %rd2, %rd21;
+ ld.f64 %fd32, [%rd22];
max.f64 %fd44, %fd44, %fd32;
BB5_8:
- shl.b32 %r21, %r14, 1;
+ shl.b32 %r21, %r13, 1;
mov.u32 %r22, %nctaid.x;
- mad.lo.s32 %r43, %r21, %r22, %r43;
- setp.lt.u32 %p5, %r43, %r10;
+ mad.lo.s32 %r46, %r21, %r22, %r46;
+ setp.lt.u32 %p5, %r46, %r10;
@%p5 bra BB5_2;
BB5_9:
- shl.b32 %r24, %r11, 3;
+ shl.b32 %r24, %r14, 3;
mov.u32 %r25, memory;
add.s32 %r9, %r25, %r24;
st.shared.f64 [%r9], %fd44;
bar.sync 0;
- setp.lt.u32 %p6, %r14, 1024;
+ setp.lt.u32 %p6, %r13, 1024;
@%p6 bra BB5_13;
- setp.gt.u32 %p7, %r11, 511;
+ setp.gt.u32 %p7, %r14, 511;
@%p7 bra BB5_12;
ld.shared.f64 %fd33, [%r9+4096];
@@ -871,10 +979,10 @@ BB5_12:
bar.sync 0;
BB5_13:
- setp.lt.u32 %p8, %r14, 512;
+ setp.lt.u32 %p8, %r13, 512;
@%p8 bra BB5_17;
- setp.gt.u32 %p9, %r11, 255;
+ setp.gt.u32 %p9, %r14, 255;
@%p9 bra BB5_16;
ld.shared.f64 %fd34, [%r9+2048];
@@ -885,10 +993,10 @@ BB5_16:
bar.sync 0;
BB5_17:
- setp.lt.u32 %p10, %r14, 256;
+ setp.lt.u32 %p10, %r13, 256;
@%p10 bra BB5_21;
- setp.gt.u32 %p11, %r11, 127;
+ setp.gt.u32 %p11, %r14, 127;
@%p11 bra BB5_20;
ld.shared.f64 %fd35, [%r9+1024];
@@ -899,10 +1007,10 @@ BB5_20:
bar.sync 0;
BB5_21:
- setp.lt.u32 %p12, %r14, 128;
+ setp.lt.u32 %p12, %r13, 128;
@%p12 bra BB5_25;
- setp.gt.u32 %p13, %r11, 63;
+ setp.gt.u32 %p13, %r14, 63;
@%p13 bra BB5_24;
ld.shared.f64 %fd36, [%r9+512];
@@ -913,10 +1021,10 @@ BB5_24:
bar.sync 0;
BB5_25:
- setp.gt.u32 %p14, %r11, 31;
+ setp.gt.u32 %p14, %r14, 31;
@%p14 bra BB5_38;
- setp.lt.u32 %p15, %r14, 64;
+ setp.lt.u32 %p15, %r13, 64;
@%p15 bra BB5_28;
ld.volatile.shared.f64 %fd37, [%r9+256];
@@ -924,7 +1032,7 @@ BB5_25:
st.volatile.shared.f64 [%r9], %fd44;
BB5_28:
- setp.lt.u32 %p16, %r14, 32;
+ setp.lt.u32 %p16, %r13, 32;
@%p16 bra BB5_30;
ld.volatile.shared.f64 %fd38, [%r9+128];
@@ -932,7 +1040,7 @@ BB5_28:
st.volatile.shared.f64 [%r9], %fd44;
BB5_30:
- setp.lt.u32 %p17, %r14, 16;
+ setp.lt.u32 %p17, %r13, 16;
@%p17 bra BB5_32;
ld.volatile.shared.f64 %fd39, [%r9+64];
@@ -940,7 +1048,7 @@ BB5_30:
st.volatile.shared.f64 [%r9], %fd44;
BB5_32:
- setp.lt.u32 %p18, %r14, 8;
+ setp.lt.u32 %p18, %r13, 8;
@%p18 bra BB5_34;
ld.volatile.shared.f64 %fd40, [%r9+32];
@@ -948,7 +1056,7 @@ BB5_32:
st.volatile.shared.f64 [%r9], %fd44;
BB5_34:
- setp.lt.u32 %p19, %r14, 4;
+ setp.lt.u32 %p19, %r13, 4;
@%p19 bra BB5_36;
ld.volatile.shared.f64 %fd41, [%r9+16];
@@ -956,7 +1064,7 @@ BB5_34:
st.volatile.shared.f64 [%r9], %fd44;
BB5_36:
- setp.lt.u32 %p20, %r14, 2;
+ setp.lt.u32 %p20, %r13, 2;
@%p20 bra BB5_38;
ld.volatile.shared.f64 %fd42, [%r9+8];
@@ -964,24 +1072,57 @@ BB5_36:
st.volatile.shared.f64 [%r9], %fd43;
BB5_38:
- setp.ne.s32 %p21, %r11, 0;
- @%p21 bra BB5_42;
+ setp.ne.s32 %p21, %r14, 0;
+ @%p21 bra BB5_43;
ld.shared.f64 %fd28, [memory];
- cvta.to.global.u64 %rd25, %rd11;
- ld.global.u64 %rd26, [%rd25+16];
- ld.global.u64 %rd27, [%rd25+32];
- cvta.to.global.u64 %rd33, %rd27;
- setp.ne.s64 %p22, %rd26, 0;
+ cvta.to.global.u64 %rd23, %rd12;
+ add.s64 %rd7, %rd23, 16;
+ ld.global.u64 %rd24, [%rd23+16];
+ setp.eq.s64 %p22, %rd24, 0;
@%p22 bra BB5_41;
- mul.wide.u32 %rd28, %r12, 8;
- add.s64 %rd33, %rd33, %rd28;
+ mov.u32 %r42, 0;
+ add.u64 %rd25, %SP, 0;
+ add.u64 %rd26, %SPL, 0;
+ st.local.u32 [%rd26], %r42;
+ st.local.u32 [%rd26+4], %r11;
+ mov.u64 %rd27, $str;
+ cvta.global.u64 %rd28, %rd27;
+ // Callseq Start 3
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd28;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd25;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r44, [retval0+0];
+
+ //{
+ }// Callseq End 3
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd35, [%rd7+16];
+ bra.uni BB5_42;
BB5_41:
- st.global.f64 [%rd33], %fd28;
+ ld.global.u64 %rd29, [%rd7+16];
+ mul.wide.u32 %rd30, %r11, 8;
+ add.s64 %rd35, %rd29, %rd30;
BB5_42:
+ st.f64 [%rd35], %fd28;
+
+BB5_43:
ret;
}
@@ -992,85 +1133,87 @@ BB5_42:
.param .u32 reduce_min_f_param_2
)
{
+ .local .align 8 .b8 __local_depot6[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<23>;
.reg .f32 %f<60>;
- .reg .b32 %r<46>;
- .reg .b64 %rd<34>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<36>;
- ld.param.u64 %rd10, [reduce_min_f_param_0];
- ld.param.u64 %rd11, [reduce_min_f_param_1];
+ mov.u64 %SPL, __local_depot6;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd11, [reduce_min_f_param_0];
+ ld.param.u64 %rd12, [reduce_min_f_param_1];
ld.param.u32 %r10, [reduce_min_f_param_2];
- mov.u32 %r11, %tid.x;
- mov.u32 %r12, %ctaid.x;
- shl.b32 %r13, %r12, 1;
- mov.u32 %r14, %ntid.x;
- mad.lo.s32 %r43, %r13, %r14, %r11;
+ mov.u32 %r11, %ctaid.x;
+ shl.b32 %r12, %r11, 1;
+ mov.u32 %r13, %ntid.x;
+ mov.u32 %r14, %tid.x;
+ mad.lo.s32 %r46, %r12, %r13, %r14;
mov.f32 %f44, 0f7F800000;
- setp.ge.u32 %p1, %r43, %r10;
+ setp.ge.u32 %p1, %r46, %r10;
@%p1 bra BB6_9;
- cvta.to.global.u64 %rd12, %rd10;
- ld.global.u64 %rd1, [%rd12+16];
- ld.global.u64 %rd13, [%rd12+32];
- cvta.to.global.u64 %rd2, %rd13;
+ cvta.to.global.u64 %rd13, %rd11;
+ ld.global.u64 %rd1, [%rd13+16];
+ ld.global.u64 %rd2, [%rd13+32];
mov.f32 %f44, 0f7F800000;
- mov.u64 %rd30, %rd1;
+ mov.u64 %rd32, %rd1;
BB6_2:
setp.eq.s64 %p2, %rd1, 0;
- mov.u32 %r44, %r43;
+ mov.u32 %r47, %r46;
@%p2 bra BB6_4;
- cvta.to.global.u64 %rd14, %rd1;
- mul.wide.u32 %rd15, %r43, 4;
- add.s64 %rd16, %rd14, %rd15;
- ld.global.u32 %r44, [%rd16];
- mov.u64 %rd30, %rd1;
+ mul.wide.u32 %rd14, %r46, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ ld.u32 %r47, [%rd15];
+ mov.u64 %rd32, %rd1;
BB6_4:
- mul.wide.u32 %rd17, %r44, 4;
- add.s64 %rd18, %rd2, %rd17;
- ld.global.f32 %f31, [%rd18];
+ mul.wide.u32 %rd16, %r47, 4;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.f32 %f31, [%rd17];
min.f32 %f44, %f44, %f31;
- add.s32 %r45, %r43, %r14;
- setp.ge.u32 %p3, %r45, %r10;
+ add.s32 %r48, %r46, %r13;
+ setp.ge.u32 %p3, %r48, %r10;
@%p3 bra BB6_8;
- setp.eq.s64 %p4, %rd30, 0;
- mov.u64 %rd30, 0;
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
@%p4 bra BB6_7;
- cvta.to.global.u64 %rd20, %rd1;
- add.s32 %r19, %r43, %r14;
- mul.wide.u32 %rd21, %r19, 4;
- add.s64 %rd22, %rd20, %rd21;
- ld.global.u32 %r45, [%rd22];
- mov.u64 %rd30, %rd1;
+ add.s32 %r19, %r46, %r13;
+ mul.wide.u32 %rd19, %r19, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ ld.u32 %r48, [%rd20];
+ mov.u64 %rd32, %rd1;
BB6_7:
- mul.wide.u32 %rd23, %r45, 4;
- add.s64 %rd24, %rd2, %rd23;
- ld.global.f32 %f32, [%rd24];
+ mul.wide.u32 %rd21, %r48, 4;
+ add.s64 %rd22, %rd2, %rd21;
+ ld.f32 %f32, [%rd22];
min.f32 %f44, %f44, %f32;
BB6_8:
- shl.b32 %r21, %r14, 1;
+ shl.b32 %r21, %r13, 1;
mov.u32 %r22, %nctaid.x;
- mad.lo.s32 %r43, %r21, %r22, %r43;
- setp.lt.u32 %p5, %r43, %r10;
+ mad.lo.s32 %r46, %r21, %r22, %r46;
+ setp.lt.u32 %p5, %r46, %r10;
@%p5 bra BB6_2;
BB6_9:
- shl.b32 %r24, %r11, 2;
+ shl.b32 %r24, %r14, 2;
mov.u32 %r25, memory;
add.s32 %r9, %r25, %r24;
st.shared.f32 [%r9], %f44;
bar.sync 0;
- setp.lt.u32 %p6, %r14, 1024;
+ setp.lt.u32 %p6, %r13, 1024;
@%p6 bra BB6_13;
- setp.gt.u32 %p7, %r11, 511;
+ setp.gt.u32 %p7, %r14, 511;
@%p7 bra BB6_12;
ld.shared.f32 %f33, [%r9+2048];
@@ -1081,10 +1224,10 @@ BB6_12:
bar.sync 0;
BB6_13:
- setp.lt.u32 %p8, %r14, 512;
+ setp.lt.u32 %p8, %r13, 512;
@%p8 bra BB6_17;
- setp.gt.u32 %p9, %r11, 255;
+ setp.gt.u32 %p9, %r14, 255;
@%p9 bra BB6_16;
ld.shared.f32 %f34, [%r9+1024];
@@ -1095,10 +1238,10 @@ BB6_16:
bar.sync 0;
BB6_17:
- setp.lt.u32 %p10, %r14, 256;
+ setp.lt.u32 %p10, %r13, 256;
@%p10 bra BB6_21;
- setp.gt.u32 %p11, %r11, 127;
+ setp.gt.u32 %p11, %r14, 127;
@%p11 bra BB6_20;
ld.shared.f32 %f35, [%r9+512];
@@ -1109,10 +1252,10 @@ BB6_20:
bar.sync 0;
BB6_21:
- setp.lt.u32 %p12, %r14, 128;
+ setp.lt.u32 %p12, %r13, 128;
@%p12 bra BB6_25;
- setp.gt.u32 %p13, %r11, 63;
+ setp.gt.u32 %p13, %r14, 63;
@%p13 bra BB6_24;
ld.shared.f32 %f36, [%r9+256];
@@ -1123,10 +1266,10 @@ BB6_24:
bar.sync 0;
BB6_25:
- setp.gt.u32 %p14, %r11, 31;
+ setp.gt.u32 %p14, %r14, 31;
@%p14 bra BB6_38;
- setp.lt.u32 %p15, %r14, 64;
+ setp.lt.u32 %p15, %r13, 64;
@%p15 bra BB6_28;
ld.volatile.shared.f32 %f37, [%r9+128];
@@ -1134,7 +1277,7 @@ BB6_25:
st.volatile.shared.f32 [%r9], %f44;
BB6_28:
- setp.lt.u32 %p16, %r14, 32;
+ setp.lt.u32 %p16, %r13, 32;
@%p16 bra BB6_30;
ld.volatile.shared.f32 %f38, [%r9+64];
@@ -1142,7 +1285,7 @@ BB6_28:
st.volatile.shared.f32 [%r9], %f44;
BB6_30:
- setp.lt.u32 %p17, %r14, 16;
+ setp.lt.u32 %p17, %r13, 16;
@%p17 bra BB6_32;
ld.volatile.shared.f32 %f39, [%r9+32];
@@ -1150,7 +1293,7 @@ BB6_30:
st.volatile.shared.f32 [%r9], %f44;
BB6_32:
- setp.lt.u32 %p18, %r14, 8;
+ setp.lt.u32 %p18, %r13, 8;
@%p18 bra BB6_34;
ld.volatile.shared.f32 %f40, [%r9+16];
@@ -1158,7 +1301,7 @@ BB6_32:
st.volatile.shared.f32 [%r9], %f44;
BB6_34:
- setp.lt.u32 %p19, %r14, 4;
+ setp.lt.u32 %p19, %r13, 4;
@%p19 bra BB6_36;
ld.volatile.shared.f32 %f41, [%r9+8];
@@ -1166,7 +1309,7 @@ BB6_34:
st.volatile.shared.f32 [%r9], %f44;
BB6_36:
- setp.lt.u32 %p20, %r14, 2;
+ setp.lt.u32 %p20, %r13, 2;
@%p20 bra BB6_38;
ld.volatile.shared.f32 %f42, [%r9+4];
@@ -1174,24 +1317,57 @@ BB6_36:
st.volatile.shared.f32 [%r9], %f43;
BB6_38:
- setp.ne.s32 %p21, %r11, 0;
- @%p21 bra BB6_42;
+ setp.ne.s32 %p21, %r14, 0;
+ @%p21 bra BB6_43;
ld.shared.f32 %f28, [memory];
- cvta.to.global.u64 %rd25, %rd11;
- ld.global.u64 %rd26, [%rd25+16];
- ld.global.u64 %rd27, [%rd25+32];
- cvta.to.global.u64 %rd33, %rd27;
- setp.ne.s64 %p22, %rd26, 0;
+ cvta.to.global.u64 %rd23, %rd12;
+ add.s64 %rd7, %rd23, 16;
+ ld.global.u64 %rd24, [%rd23+16];
+ setp.eq.s64 %p22, %rd24, 0;
@%p22 bra BB6_41;
- mul.wide.u32 %rd28, %r12, 4;
- add.s64 %rd33, %rd33, %rd28;
+ mov.u32 %r42, 0;
+ add.u64 %rd25, %SP, 0;
+ add.u64 %rd26, %SPL, 0;
+ st.local.u32 [%rd26], %r42;
+ st.local.u32 [%rd26+4], %r11;
+ mov.u64 %rd27, $str;
+ cvta.global.u64 %rd28, %rd27;
+ // Callseq Start 4
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd28;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd25;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r44, [retval0+0];
+
+ //{
+ }// Callseq End 4
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd35, [%rd7+16];
+ bra.uni BB6_42;
BB6_41:
- st.global.f32 [%rd33], %f28;
+ ld.global.u64 %rd29, [%rd7+16];
+ mul.wide.u32 %rd30, %r11, 4;
+ add.s64 %rd35, %rd29, %rd30;
BB6_42:
+ st.f32 [%rd35], %f28;
+
+BB6_43:
ret;
}
@@ -1202,85 +1378,87 @@ BB6_42:
.param .u32 reduce_min_d_param_2
)
{
+ .local .align 8 .b8 __local_depot7[8];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
.reg .pred %p<23>;
- .reg .b32 %r<46>;
+ .reg .b32 %r<49>;
.reg .f64 %fd<60>;
- .reg .b64 %rd<34>;
+ .reg .b64 %rd<36>;
- ld.param.u64 %rd10, [reduce_min_d_param_0];
- ld.param.u64 %rd11, [reduce_min_d_param_1];
+ mov.u64 %SPL, __local_depot7;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd11, [reduce_min_d_param_0];
+ ld.param.u64 %rd12, [reduce_min_d_param_1];
ld.param.u32 %r10, [reduce_min_d_param_2];
- mov.u32 %r11, %tid.x;
- mov.u32 %r12, %ctaid.x;
- shl.b32 %r13, %r12, 1;
- mov.u32 %r14, %ntid.x;
- mad.lo.s32 %r43, %r13, %r14, %r11;
+ mov.u32 %r11, %ctaid.x;
+ shl.b32 %r12, %r11, 1;
+ mov.u32 %r13, %ntid.x;
+ mov.u32 %r14, %tid.x;
+ mad.lo.s32 %r46, %r12, %r13, %r14;
mov.f64 %fd44, 0d7FF0000000000000;
- setp.ge.u32 %p1, %r43, %r10;
+ setp.ge.u32 %p1, %r46, %r10;
@%p1 bra BB7_9;
- cvta.to.global.u64 %rd12, %rd10;
- ld.global.u64 %rd1, [%rd12+16];
- ld.global.u64 %rd13, [%rd12+32];
- cvta.to.global.u64 %rd2, %rd13;
+ cvta.to.global.u64 %rd13, %rd11;
+ ld.global.u64 %rd1, [%rd13+16];
+ ld.global.u64 %rd2, [%rd13+32];
mov.f64 %fd44, 0d7FF0000000000000;
- mov.u64 %rd30, %rd1;
+ mov.u64 %rd32, %rd1;
BB7_2:
setp.eq.s64 %p2, %rd1, 0;
- mov.u32 %r44, %r43;
+ mov.u32 %r47, %r46;
@%p2 bra BB7_4;
- cvta.to.global.u64 %rd14, %rd1;
- mul.wide.u32 %rd15, %r43, 4;
- add.s64 %rd16, %rd14, %rd15;
- ld.global.u32 %r44, [%rd16];
- mov.u64 %rd30, %rd1;
+ mul.wide.u32 %rd14, %r46, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ ld.u32 %r47, [%rd15];
+ mov.u64 %rd32, %rd1;
BB7_4:
- mul.wide.u32 %rd17, %r44, 8;
- add.s64 %rd18, %rd2, %rd17;
- ld.global.f64 %fd31, [%rd18];
+ mul.wide.u32 %rd16, %r47, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.f64 %fd31, [%rd17];
min.f64 %fd44, %fd44, %fd31;
- add.s32 %r45, %r43, %r14;
- setp.ge.u32 %p3, %r45, %r10;
+ add.s32 %r48, %r46, %r13;
+ setp.ge.u32 %p3, %r48, %r10;
@%p3 bra BB7_8;
- setp.eq.s64 %p4, %rd30, 0;
- mov.u64 %rd30, 0;
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
@%p4 bra BB7_7;
- cvta.to.global.u64 %rd20, %rd1;
- add.s32 %r19, %r43, %r14;
- mul.wide.u32 %rd21, %r19, 4;
- add.s64 %rd22, %rd20, %rd21;
- ld.global.u32 %r45, [%rd22];
- mov.u64 %rd30, %rd1;
+ add.s32 %r19, %r46, %r13;
+ mul.wide.u32 %rd19, %r19, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ ld.u32 %r48, [%rd20];
+ mov.u64 %rd32, %rd1;
BB7_7:
- mul.wide.u32 %rd23, %r45, 8;
- add.s64 %rd24, %rd2, %rd23;
- ld.global.f64 %fd32, [%rd24];
+ mul.wide.u32 %rd21, %r48, 8;
+ add.s64 %rd22, %rd2, %rd21;
+ ld.f64 %fd32, [%rd22];
min.f64 %fd44, %fd44, %fd32;
BB7_8:
- shl.b32 %r21, %r14, 1;
+ shl.b32 %r21, %r13, 1;
mov.u32 %r22, %nctaid.x;
- mad.lo.s32 %r43, %r21, %r22, %r43;
- setp.lt.u32 %p5, %r43, %r10;
+ mad.lo.s32 %r46, %r21, %r22, %r46;
+ setp.lt.u32 %p5, %r46, %r10;
@%p5 bra BB7_2;
BB7_9:
- shl.b32 %r24, %r11, 3;
+ shl.b32 %r24, %r14, 3;
mov.u32 %r25, memory;
add.s32 %r9, %r25, %r24;
st.shared.f64 [%r9], %fd44;
bar.sync 0;
- setp.lt.u32 %p6, %r14, 1024;
+ setp.lt.u32 %p6, %r13, 1024;
@%p6 bra BB7_13;
- setp.gt.u32 %p7, %r11, 511;
+ setp.gt.u32 %p7, %r14, 511;
@%p7 bra BB7_12;
ld.shared.f64 %fd33, [%r9+4096];
@@ -1291,10 +1469,10 @@ BB7_12:
bar.sync 0;
BB7_13:
- setp.lt.u32 %p8, %r14, 512;
+ setp.lt.u32 %p8, %r13, 512;
@%p8 bra BB7_17;
- setp.gt.u32 %p9, %r11, 255;
+ setp.gt.u32 %p9, %r14, 255;
@%p9 bra BB7_16;
ld.shared.f64 %fd34, [%r9+2048];
@@ -1305,10 +1483,10 @@ BB7_16:
bar.sync 0;
BB7_17:
- setp.lt.u32 %p10, %r14, 256;
+ setp.lt.u32 %p10, %r13, 256;
@%p10 bra BB7_21;
- setp.gt.u32 %p11, %r11, 127;
+ setp.gt.u32 %p11, %r14, 127;
@%p11 bra BB7_20;
ld.shared.f64 %fd35, [%r9+1024];
@@ -1319,10 +1497,10 @@ BB7_20:
bar.sync 0;
BB7_21:
- setp.lt.u32 %p12, %r14, 128;
+ setp.lt.u32 %p12, %r13, 128;
@%p12 bra BB7_25;
- setp.gt.u32 %p13, %r11, 63;
+ setp.gt.u32 %p13, %r14, 63;
@%p13 bra BB7_24;
ld.shared.f64 %fd36, [%r9+512];
@@ -1333,10 +1511,10 @@ BB7_24:
bar.sync 0;
BB7_25:
- setp.gt.u32 %p14, %r11, 31;
+ setp.gt.u32 %p14, %r14, 31;
@%p14 bra BB7_38;
- setp.lt.u32 %p15, %r14, 64;
+ setp.lt.u32 %p15, %r13, 64;
@%p15 bra BB7_28;
ld.volatile.shared.f64 %fd37, [%r9+256];
@@ -1344,7 +1522,7 @@ BB7_25:
st.volatile.shared.f64 [%r9], %fd44;
BB7_28:
- setp.lt.u32 %p16, %r14, 32;
+ setp.lt.u32 %p16, %r13, 32;
@%p16 bra BB7_30;
ld.volatile.shared.f64 %fd38, [%r9+128];
@@ -1352,7 +1530,7 @@ BB7_28:
st.volatile.shared.f64 [%r9], %fd44;
BB7_30:
- setp.lt.u32 %p17, %r14, 16;
+ setp.lt.u32 %p17, %r13, 16;
@%p17 bra BB7_32;
ld.volatile.shared.f64 %fd39, [%r9+64];
@@ -1360,7 +1538,7 @@ BB7_30:
st.volatile.shared.f64 [%r9], %fd44;
BB7_32:
- setp.lt.u32 %p18, %r14, 8;
+ setp.lt.u32 %p18, %r13, 8;
@%p18 bra BB7_34;
ld.volatile.shared.f64 %fd40, [%r9+32];
@@ -1368,7 +1546,7 @@ BB7_32:
st.volatile.shared.f64 [%r9], %fd44;
BB7_34:
- setp.lt.u32 %p19, %r14, 4;
+ setp.lt.u32 %p19, %r13, 4;
@%p19 bra BB7_36;
ld.volatile.shared.f64 %fd41, [%r9+16];
@@ -1376,7 +1554,7 @@ BB7_34:
st.volatile.shared.f64 [%r9], %fd44;
BB7_36:
- setp.lt.u32 %p20, %r14, 2;
+ setp.lt.u32 %p20, %r13, 2;
@%p20 bra BB7_38;
ld.volatile.shared.f64 %fd42, [%r9+8];
@@ -1384,24 +1562,57 @@ BB7_36:
st.volatile.shared.f64 [%r9], %fd43;
BB7_38:
- setp.ne.s32 %p21, %r11, 0;
- @%p21 bra BB7_42;
+ setp.ne.s32 %p21, %r14, 0;
+ @%p21 bra BB7_43;
ld.shared.f64 %fd28, [memory];
- cvta.to.global.u64 %rd25, %rd11;
- ld.global.u64 %rd26, [%rd25+16];
- ld.global.u64 %rd27, [%rd25+32];
- cvta.to.global.u64 %rd33, %rd27;
- setp.ne.s64 %p22, %rd26, 0;
+ cvta.to.global.u64 %rd23, %rd12;
+ add.s64 %rd7, %rd23, 16;
+ ld.global.u64 %rd24, [%rd23+16];
+ setp.eq.s64 %p22, %rd24, 0;
@%p22 bra BB7_41;
- mul.wide.u32 %rd28, %r12, 8;
- add.s64 %rd33, %rd33, %rd28;
+ mov.u32 %r42, 0;
+ add.u64 %rd25, %SP, 0;
+ add.u64 %rd26, %SPL, 0;
+ st.local.u32 [%rd26], %r42;
+ st.local.u32 [%rd26+4], %r11;
+ mov.u64 %rd27, $str;
+ cvta.global.u64 %rd28, %rd27;
+ // Callseq Start 5
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd28;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd25;
+ .param .b32 retval0;
+ call.uni (retval0),
+ vprintf,
+ (
+ param0,
+ param1
+ );
+ ld.param.b32 %r44, [retval0+0];
+
+ //{
+ }// Callseq End 5
+ // inline asm
+ trap;
+ // inline asm
+ ld.global.u64 %rd35, [%rd7+16];
+ bra.uni BB7_42;
BB7_41:
- st.global.f64 [%rd33], %fd28;
+ ld.global.u64 %rd29, [%rd7+16];
+ mul.wide.u32 %rd30, %r11, 8;
+ add.s64 %rd35, %rd29, %rd30;
BB7_42:
+ st.f64 [%rd35], %fd28;
+
+BB7_43:
ret;
}