You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:34 UTC
[37/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
new file mode 100644
index 0000000..198ac31
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
@@ -0,0 +1,858 @@
+#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/fft_operations.hpp
+ @brief Implementations of Fast Furier Transformation using cuda
+*/
+#include <cmath>
+#include <viennacl/matrix.hpp>
+#include <viennacl/vector.hpp>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+ namespace fft
+ {
+ const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+ inline vcl_size_t num_bits(vcl_size_t size)
+ {
+ vcl_size_t bits_datasize = 0;
+ vcl_size_t ds = 1;
+
+ while (ds < size)
+ {
+ ds = ds << 1;
+ bits_datasize++;
+ }
+
+ return bits_datasize;
+ }
+
+ inline vcl_size_t next_power_2(vcl_size_t n)
+ {
+ n = n - 1;
+
+ vcl_size_t power = 1;
+
+ while (power < sizeof(vcl_size_t) * 8)
+ {
+ n = n | (n >> power);
+ power *= 2;
+ }
+
+ return n + 1;
+ }
+
+ } //namespace fft
+} //namespace detail
+
+// addition
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+ return make_float2(a.x + b.x, a.y + b.y);
+}
+
+// subtract
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+ return make_float2(a.x - b.x, a.y - b.y);
+}
+// division
+template<typename SCALARTYPE>
+inline __device__ float2 operator/(float2 a,SCALARTYPE b)
+{
+ return make_float2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __device__ float2 operator*(float2 in1, float2 in2)
+{
+ return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+// addition
+inline __host__ __device__ double2 operator+(double2 a, double2 b)
+{
+ return make_double2(a.x + b.x, a.y + b.y);
+}
+
+// subtraction
+inline __host__ __device__ double2 operator-(double2 a, double2 b)
+{
+ return make_double2(a.x - b.x, a.y - b.y);
+}
+
+// division
+template<typename SCALARTYPE>
+inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
+{
+ return make_double2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
+{
+ return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
+{
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ v = (v >> 16) | (v << 16);
+ v = v >> (32 - bit_size);
+ return v;
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_direct(
+ const Numeric2T * input,
+ Numeric2T * output,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
+ {
+ Numeric2T f;
+ f.x = 0;
+ f.y = 0;
+
+ for (unsigned int n = 0; n < size; n++)
+ {
+ Numeric2T in;
+ if (!is_row_major)
+ in = input[batch_id * stride + n]; //input index here
+ else
+ in = input[n * stride + batch_id];//input index here
+
+ NumericT sn,cs;
+ NumericT arg = sign * 2 * NUM_PI * k / size * n;
+ sn = sin(arg);
+ cs = cos(arg);
+
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+ Numeric2T tmp;
+ tmp.x = in.x * ex.x - in.y * ex.y;
+ tmp.y = in.x * ex.y + in.y * ex.x;
+ f = f + tmp;
+ }
+
+ if (!is_row_major)
+ output[batch_id * stride + k] = f; // output index here
+ else
+ output[k * stride + batch_id] = f;// output index here
+ }
+ }
+}
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+ viennacl::vector<NumericT, AlignmentV> & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+ NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+ NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
+
+template<typename NumericT>
+__global__ void fft_reorder(NumericT * input,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ bool is_row_major)
+{
+
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int v = get_reorder_num(i, bit_size);
+
+ if (i < v)
+ {
+ if (!is_row_major)
+ {
+ NumericT tmp = input[batch_id * stride + i]; // index
+ input[batch_id * stride + i] = input[batch_id * stride + v];//index
+ input[batch_id * stride + v] = tmp;//index
+ }
+ else
+ {
+ NumericT tmp = input[i * stride + batch_id];
+ input[i * stride + batch_id] = input[v * stride + batch_id];
+ input[v * stride + batch_id] = tmp;
+ }
+ }
+ }
+ }
+}
+
+/***
+ * This function performs reorder of input data. Indexes are sorted in bit-reversal order.
+ * Such reordering should be done before in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV> & in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bits_datasize),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2_local(Numeric2T * input,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+ __shared__ Numeric2T lcl_input[1024];
+ unsigned int grp_id = blockIdx.x;
+ unsigned int grp_num = gridDim.x;
+
+ unsigned int lcl_sz = blockDim.x;
+ unsigned int lcl_id = threadIdx.x;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
+ {
+ for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+ {
+ unsigned int v = get_reorder_num(p, bit_size);
+ if (!is_row_major)
+ lcl_input[v] = input[batch_id * stride + p];
+ else
+ lcl_input[v] = input[p * stride + batch_id];
+ }
+
+ __syncthreads();
+
+ //performs Cooley-Tukey FFT on local arrayfft
+ for (unsigned int s = 0; s < bit_size; s++)
+ {
+ unsigned int ss = 1 << s;
+ NumericT cs, sn;
+ for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
+ {
+ unsigned int group = (tid & (ss - 1));
+ unsigned int pos = ((tid >> s) << (s + 1)) + group;
+
+ Numeric2T in1 = lcl_input[pos];
+ Numeric2T in2 = lcl_input[pos + ss];
+
+ NumericT arg = group * sign * NUM_PI / ss;
+
+ sn = sin(arg);
+ cs = cos(arg);
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+
+ Numeric2T tmp;
+ tmp.x = in2.x * ex.x - in2.y * ex.y;
+ tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+ lcl_input[pos + ss] = in1 - tmp;
+ lcl_input[pos] = in1 + tmp;
+ }
+ __syncthreads();
+ }
+
+ //copy local array back to global memory
+ for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+ {
+ if (!is_row_major)
+ input[batch_id * stride + p] = lcl_input[p]; //index
+ else
+ input[p * stride + batch_id] = lcl_input[p];
+ }
+
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2(Numeric2T * input,
+ unsigned int s,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+
+ unsigned int ss = 1 << s;
+ unsigned int half_size = size >> 1;
+
+ NumericT cs, sn;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
+ {
+ unsigned int group = (tid & (ss - 1));
+ unsigned int pos = ((tid >> s) << (s + 1)) + group;
+ Numeric2T in1;
+ Numeric2T in2;
+ unsigned int offset;
+ if (!is_row_major)
+ {
+ offset = batch_id * stride + pos;
+ in1 = input[offset]; //index
+ in2 = input[offset + ss];//index
+ }
+ else
+ {
+ offset = pos * stride + batch_id;
+ in1 = input[offset]; //index
+ in2 = input[offset + ss * stride];//index
+ }
+
+ NumericT arg = group * sign * NUM_PI / ss;
+
+ sn = sin(arg);
+ cs = cos(arg);
+
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+
+ Numeric2T tmp;
+ tmp.x = in2.x * ex.x - in2.y * ex.y;
+ tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+ if (!is_row_major)
+ input[offset + ss] = in1 - tmp; //index
+ else
+ input[offset + ss * stride] = in1 - tmp; //index
+ input[offset] = in1 + tmp; //index
+ }
+ }
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * Serial implementation has o(n * lg n) complexity.
+ * This is a Cooley-Tukey algorithm
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV> & in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+ if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<NumericT>(sign),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+ }
+ else
+ {
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+
+ for (vcl_size_t step = 0; step < bit_size; step++)
+ {
+ fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(step),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+ }
+ }
+}
+
+/**
+ * @brief Radix-2 2D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * Serial implementation has o(n * lg n) complexity.
+ * This is a Cooley-Tukey algorithm
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+ if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+ }
+ else
+ {
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+ for (vcl_size_t step = 0; step < bit_size; step++)
+ {
+ fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(step),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+ }
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
+{
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz =gridDim.x * blockDim.x;
+
+ unsigned int double_size = size << 1;
+ NumericT sn_a, cs_a;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int rm = i * i % (double_size);
+ NumericT angle = (NumericT)rm / size * (-NUM_PI);
+
+ sn_a = sin(angle);
+ cs_a= cos(angle);
+
+ Numeric2T b_i;
+ b_i.x = cs_a;
+ b_i.y = sn_a;
+ out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
+ out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
+ unsigned int size, unsigned int ext_size, NumericT sign)
+{
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ unsigned int double_size = size << 1;
+
+ NumericT sn_a, cs_a;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int rm = i * i % (double_size);
+ NumericT angle = (NumericT)rm / size * NUM_PI;
+
+ sn_a = sin(-angle);
+ cs_a= cos(-angle);
+
+ Numeric2T a_i;
+ a_i.x =cs_a;
+ a_i.y =sn_a;
+
+ Numeric2T b_i;
+ b_i.x =cs_a;
+ b_i.y =-sn_a;
+
+ A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
+ A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
+ B[i] = b_i;
+
+ // very bad instruction, to be fixed
+ if (i)
+ B[ext_size - i] = b_i;
+ }
+}
+
+template<typename NumericT>
+__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ input1[i].x = 0;
+ input1[i].y = 0;
+
+ input2[i].x = 0;
+ input2[i].y = 0;
+ }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently, Works only for sizes of input data which less than 2^16.
+ * Uses a lot of additional memory, but should be fast for any size of data.
+ * Serial implementation has something about o(n * lg n) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+ viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = in.size() >> 1;
+ vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);
+
+ viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+ zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+ static_cast<unsigned int>(ext_size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
+
+ bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(ext_size),
+ NumericT(1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
+
+ viennacl::linalg::convolve_i(A, B, Z);
+
+ bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ NumericT(1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
+}
+
+template<typename NumericT>
+__global__ void fft_mult_vec(const NumericT * input1,
+ const NumericT * input2,
+ NumericT * output,
+ unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ NumericT in1 = input1[i];
+ NumericT in2 = input2[i];
+ output[i] = in1 * in2;
+ }
+}
+
+/**
+ * @brief Mutiply two complex vectors and store result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = input1.size() / 2;
+
+ fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
+ reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
+ input1[i] = input1[i]/factor;
+}
+
+/**
+ * @brief Normalize vector on with his own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = input.size() >> 1;
+ NumericT norm_factor = static_cast<NumericT>(size);
+ fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+ static_cast<unsigned int>(size),
+ norm_factor);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
+}
+
+template<typename NumericT>
+__global__ void transpose(const NumericT * input,
+ NumericT * output,
+ unsigned int row_num,
+ unsigned int col_num)
+{
+ unsigned int size = row_num * col_num;
+ for (unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
+ {
+ unsigned int row = i / col_num;
+ unsigned int col = i - row*col_num;
+ unsigned int new_pos = col * row_num + row;
+ output[new_pos] = input[i];
+ }
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
+ static_cast<unsigned int>(input.internal_size1()>>1),
+ static_cast<unsigned int>(input.internal_size2()>>1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
+
+}
+
+template<typename NumericT>
+__global__ void transpose_inplace(
+ NumericT * input,
+ unsigned int row_num,
+ unsigned int col_num)
+{
+ unsigned int size = row_num * col_num;
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
+ {
+ unsigned int row = i / col_num;
+ unsigned int col = i - row*col_num;
+ unsigned int new_pos = col * row_num + row;
+ if (i < new_pos)
+ {
+ NumericT val = input[i];
+ input[i] = input[new_pos];
+ input[new_pos] = val;
+ }
+ }
+}
+
+/**
+ * @brief Inplace_transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+ static_cast<unsigned int>(input.internal_size1()>>1),
+ static_cast<unsigned int>(input.internal_size2() >> 1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
+
+}
+
+template<typename RealT,typename ComplexT>
+__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ ComplexT val;
+ val.x = in[i];
+ val.y = 0;
+ out[i] = val;
+ }
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
+}
+
+template<typename ComplexT,typename RealT>
+__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ out[i] = in[i].x;
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT>& out, vcl_size_t size)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ viennacl::cuda_arg(out),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
+
+}
+
+template<typename NumericT>
+__global__ void reverse_inplace(NumericT * vec, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
+ {
+ NumericT val1 = vec[i];
+ NumericT val2 = vec[size - i - 1];
+ vec[i] = val2;
+ vec[size - i - 1] = val1;
+ }
+}
+
+/**
+ * @brief Reverse vector to oposite order and save it in input vector
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+ vcl_size_t size = in.size();
+ reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
+}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
new file mode 100644
index 0000000..302a73c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
@@ -0,0 +1,666 @@
+#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_L_kernel_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ unsigned int A_size1,
+ unsigned int * L_row_indices)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int num_entries_L = 0;
+ for (unsigned int j=row_begin; j<row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (col <= row)
+ ++num_entries_L;
+ }
+
+ L_row_indices[row] = num_entries_L;
+ }
+}
+
+template<typename NumericT>
+__global__ void extract_L_kernel_2(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ unsigned int const *L_row_indices,
+ unsigned int *L_col_indices,
+ NumericT *L_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int index_L = L_row_indices[row];
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ NumericT value = A_elements[j];
+
+ if (col <= row)
+ {
+ L_col_indices[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+ }
+ }
+}
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ //
+ // Step 1: Count elements in L and U:
+ //
+ extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");
+
+ L.generate_row_block_information();
+
+} // extract_L
+
+///////////////////////////////////////////////
+
+
+template<typename NumericT>
+__global__ void ilu_scale_kernel_1(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (row == col)
+ {
+ D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
+ break;
+ }
+ }
+ }
+}
+
+/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
+template<typename NumericT>
+__global__ void ilu_scale_kernel_2(
+ unsigned int const *R_row_indices,
+ unsigned int const *R_col_indices,
+ NumericT *R_elements,
+ unsigned int R_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < R_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = R_row_indices[row];
+ unsigned int row_end = R_row_indices[row+1];
+
+ NumericT D_row = D_elements[row];
+
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ R_elements[j] *= D_row * D_elements[R_col_indices[j]];
+ }
+}
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ // fill D:
+ ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg(D)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+ // scale L:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ static_cast<unsigned int>(L.size1()),
+ viennacl::cuda_arg(D)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ICC sweep */
+template<typename NumericT>
+__global__ void icc_chow_patel_sweep_kernel(
+ unsigned int const *L_row_indices,
+ unsigned int const *L_col_indices,
+ NumericT *L_elements,
+ NumericT const *L_backup,
+ unsigned int L_size1,
+ NumericT const *aij_L)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < L_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_Li_start = L_row_indices[row];
+ unsigned int row_Li_end = L_row_indices[row + 1];
+
+ for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+ {
+ unsigned int col = L_col_indices[i];
+
+ unsigned int row_Lj_start = L_row_indices[col];
+ unsigned int row_Lj_end = L_row_indices[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_Lj = row_Lj_start;
+ unsigned int col_Lj = L_col_indices[index_Lj];
+ NumericT s = aij_L[i];
+ for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+ {
+ unsigned int col_Li = L_col_indices[index_Li];
+
+ // find element in U
+ while (col_Lj < col_Li)
+ {
+ ++index_Lj;
+ col_Lj = L_col_indices[index_Lj];
+ }
+
+ if (col_Lj == col_Li)
+ s -= L_backup[index_Li] * L_backup[index_Lj];
+ }
+
+ // update l_ij:
+ L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); // diagonal element is last entry in U
+ }
+
+ }
+}
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L)
+{
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<NumericT>(L_backup),
+ static_cast<unsigned int>(L.size1()),
+
+ viennacl::cuda_arg<NumericT>(aij_L.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");
+
+}
+
+
+////////////////////////////// ILU ///////////////////////////
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_LU_kernel_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ unsigned int A_size1,
+
+ unsigned int * L_row_indices,
+
+ unsigned int * U_row_indices)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int num_entries_L = 0;
+ unsigned int num_entries_U = 0;
+ for (unsigned int j=row_begin; j<row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (col <= row)
+ ++num_entries_L;
+ if (col >= row)
+ ++num_entries_U;
+ }
+
+ L_row_indices[row] = num_entries_L;
+ U_row_indices[row] = num_entries_U;
+ }
+}
+
+template<typename NumericT>
+__global__ void extract_LU_kernel_2(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ unsigned int const *L_row_indices,
+ unsigned int *L_col_indices,
+ NumericT *L_elements,
+
+ unsigned int const *U_row_indices,
+ unsigned int *U_col_indices,
+ NumericT *U_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int index_L = L_row_indices[row];
+ unsigned int index_U = U_row_indices[row];
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ NumericT value = A_elements[j];
+
+ if (col <= row)
+ {
+ L_col_indices[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+
+ if (col >= row)
+ {
+ U_col_indices[index_U] = col;
+ U_elements[index_U] = value;
+ ++index_U;
+ }
+ }
+ }
+}
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ //
+ // Step 1: Count elements in L and U:
+ //
+ extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+ U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<unsigned int>(U.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle2()),
+ viennacl::cuda_arg<NumericT>(U.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");
+
+ L.generate_row_block_information();
+ // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ // fill D:
+ ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+ // scale L:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ static_cast<unsigned int>(L.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+
+ // scale U:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle2()),
+ viennacl::cuda_arg<NumericT>(U.handle()),
+ static_cast<unsigned int>(U.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ILU sweep */
+template<typename NumericT>
+__global__ void ilu_chow_patel_sweep_kernel(
+ unsigned int const *L_row_indices,
+ unsigned int const *L_col_indices,
+ NumericT *L_elements,
+ NumericT const *L_backup,
+ unsigned int L_size1,
+
+ NumericT const *aij_L,
+
+ unsigned int const *U_trans_row_indices,
+ unsigned int const *U_trans_col_indices,
+ NumericT *U_trans_elements,
+ NumericT const *U_trans_backup,
+
+ NumericT const *aij_U_trans)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < L_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_L_start = L_row_indices[row];
+ unsigned int row_L_end = L_row_indices[row + 1];
+
+ for (unsigned int j = row_L_start; j < row_L_end; ++j)
+ {
+ unsigned int col = L_col_indices[j];
+
+ if (col == row)
+ continue;
+
+ unsigned int row_U_start = U_trans_row_indices[col];
+ unsigned int row_U_end = U_trans_row_indices[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_U = row_U_start;
+ unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
+ NumericT sum = 0;
+ for (unsigned int k = row_L_start; k < j; ++k)
+ {
+ unsigned int col_L = L_col_indices[k];
+
+ // find element in U
+ while (col_U < col_L)
+ {
+ ++index_U;
+ col_U = U_trans_col_indices[index_U];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[k] * U_trans_backup[index_U];
+ }
+
+ // update l_ij:
+ L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; // diagonal element is last entry in U
+ }
+
+
+ //
+ // update U:
+ //
+ unsigned int row_U_start = U_trans_row_indices[row];
+ unsigned int row_U_end = U_trans_row_indices[row + 1];
+ for (unsigned int j = row_U_start; j < row_U_end; ++j)
+ {
+ unsigned int col = U_trans_col_indices[j];
+
+ row_L_start = L_row_indices[col];
+ row_L_end = L_row_indices[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_L = row_L_start;
+ unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
+ NumericT sum = 0;
+ for (unsigned int k = row_U_start; k < j; ++k)
+ {
+ unsigned int col_U = U_trans_col_indices[k];
+
+ // find element in L
+ while (col_L < col_U)
+ {
+ ++index_L;
+ col_L = L_col_indices[index_L];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[index_L] * U_trans_backup[k];
+ }
+
+ // update u_ij:
+ U_trans_elements[j] = aij_U_trans[j] - sum;
+ }
+ }
+}
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::backend::mem_handle U_backup;
+ viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+ viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+ ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<NumericT>(L_backup),
+ static_cast<unsigned int>(L.size1()),
+
+ viennacl::cuda_arg<NumericT>(aij_L.handle()),
+
+ viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
+ viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
+ viennacl::cuda_arg<NumericT>(U_trans.handle()),
+ viennacl::cuda_arg<NumericT>(U_backup),
+
+ viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");
+
+}
+
+//////////////////////////////////////
+
+template<typename NumericT>
+__global__ void ilu_form_neumann_matrix_kernel(
+ unsigned int const *R_row_indices,
+ unsigned int const *R_col_indices,
+ NumericT *R_elements,
+ unsigned int R_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < R_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = R_row_indices[row];
+ unsigned int row_end = R_row_indices[row+1];
+
+ // part 1: extract diagonal entry
+ NumericT diag = 0;
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = R_col_indices[j];
+ if (col == row)
+ {
+ diag = R_elements[j];
+ R_elements[j] = 0; // (I - D^{-1}R)
+ break;
+ }
+ }
+ D_elements[row] = diag;
+
+ // part2: scale
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ R_elements[j] /= -diag;
+ }
+}
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
+ viennacl::cuda_arg<unsigned int>(R.handle2()),
+ viennacl::cuda_arg<NumericT>(R.handle()),
+ static_cast<unsigned int>(R.size1()),
+ viennacl::cuda_arg<NumericT>(diag_R.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
+}
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif