Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:34 UTC

[37/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
new file mode 100644
index 0000000..198ac31
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
@@ -0,0 +1,858 @@
+#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+  Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/fft_operations.hpp
+    @brief Implementations of the Fast Fourier Transform using CUDA
+*/
+#include <cmath>
+#include <viennacl/matrix.hpp>
+#include <viennacl/vector.hpp>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+  namespace fft
+  {
+    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+    inline vcl_size_t num_bits(vcl_size_t size)
+    {
+      vcl_size_t bits_datasize = 0;
+      vcl_size_t ds = 1;
+
+      while (ds < size)
+      {
+        ds = ds << 1;
+        bits_datasize++;
+      }
+
+      return bits_datasize;
+    }
+
+    inline vcl_size_t next_power_2(vcl_size_t n)
+    {
+      n = n - 1;
+
+      vcl_size_t power = 1;
+
+      while (power < sizeof(vcl_size_t) * 8)
+      {
+        n = n | (n >> power);
+        power *= 2;
+      }
+
+      return n + 1;
+    }
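+
+    // Illustrative values (editorial comment, not part of the upstream ViennaCL sources):
+    //   num_bits(8)  == 3,   num_bits(9)  == 4    (smallest b such that 2^b >= size)
+    //   next_power_2(8) == 8,  next_power_2(9) == 16  (smallest power of two >= n)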
+
+  } //namespace fft
+} //namespace detail
+
+// addition
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+  return make_float2(a.x + b.x, a.y + b.y);
+}
+
+// subtract
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+  return make_float2(a.x - b.x, a.y - b.y);
+}
+// division
+template<typename SCALARTYPE>
+inline __device__ float2 operator/(float2 a,SCALARTYPE b)
+{
+  return make_float2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __device__ float2 operator*(float2 in1, float2 in2)
+{
+  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+// addition
+inline __host__ __device__ double2 operator+(double2 a, double2 b)
+{
+  return make_double2(a.x + b.x, a.y + b.y);
+}
+
+// subtraction
+inline __host__ __device__ double2 operator-(double2 a, double2 b)
+{
+  return make_double2(a.x - b.x, a.y - b.y);
+}
+
+// division
+template<typename SCALARTYPE>
+inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
+{
+  return make_double2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
+{
+  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
+{
+  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+  v = (v >> 16) | (v << 16);
+  v = v >> (32 - bit_size);
+  return v;
+}
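+
+// Example (editorial comment, not part of the upstream ViennaCL sources): get_reorder_num
+// reverses the lowest bit_size bits of v. For bit_size == 3: 1 (001b) -> 4 (100b),
+// 3 (011b) -> 6 (110b), 6 (110b) -> 3 (011b). This is the bit-reversal permutation used by
+// fft_reorder and the radix-2 kernels below.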
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_direct(
+    const Numeric2T * input,
+    Numeric2T * output,
+    unsigned int size,
+    unsigned int stride,
+    unsigned int batch_num,
+    NumericT sign,
+    bool is_row_major)
+{
+
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
+    {
+      Numeric2T f;
+      f.x = 0;
+      f.y = 0;
+
+      for (unsigned int n = 0; n < size; n++)
+      {
+        Numeric2T in;
+        if (!is_row_major)
+          in = input[batch_id * stride + n];   //input index here
+        else
+          in = input[n * stride + batch_id];//input index here
+
+        NumericT sn,cs;
+        NumericT arg = sign * 2 * NUM_PI * k / size * n;
+        sn = sin(arg);
+        cs = cos(arg);
+
+        Numeric2T ex;
+        ex.x = cs;
+        ex.y = sn;
+        Numeric2T tmp;
+        tmp.x = in.x * ex.x - in.y * ex.y;
+        tmp.y = in.x * ex.y + in.y * ex.x;
+        f = f + tmp;
+      }
+
+      if (!is_row_major)
+        output[batch_id * stride + k] = f; // output index here
+      else
+        output[k * stride + batch_id] = f;// output index here
+    }
+  }
+}
+
+/**
+ * @brief Direct 1D algorithm for computing the Fourier transform.
+ *
+ * Works for data of any size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+            NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
+                          static_cast<unsigned int>(size),
+                          static_cast<unsigned int>(stride),
+                          static_cast<unsigned int>(batch_num),
+                          sign,
+                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
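+
+// Hypothetical usage sketch (editorial comment, not part of the upstream ViennaCL sources).
+// A vector of N complex values is assumed to be stored as 2*N interleaved (real, imaginary)
+// scalars; the kernel then computes output[k] = sum_n input[n] * exp(sign * 2*pi*i*k*n / size):
+//
+//   viennacl::vector<float> in(2 * N), out(2 * N);
+//   // ... fill 'in' with interleaved (re, im) pairs ...
+//   viennacl::linalg::cuda::direct(in, out, N, N, 1);   // forward DFT (sign = -1 by default)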
+
+/**
+ * @brief Direct 2D algorithm for computing the Fourier transform.
+ *
+ * Works for data of any size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+            NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
+                          static_cast<unsigned int>(size),
+                          static_cast<unsigned int>(stride),
+                          static_cast<unsigned int>(batch_num),
+                          sign,
+                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
+
+template<typename NumericT>
+__global__ void fft_reorder(NumericT * input,
+                            unsigned int bit_size,
+                            unsigned int size,
+                            unsigned int stride,
+                            unsigned int batch_num,
+                            bool is_row_major)
+{
+
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int i = glb_id; i < size; i += glb_sz)
+    {
+      unsigned int v = get_reorder_num(i, bit_size);
+
+      if (i < v)
+      {
+        if (!is_row_major)
+        {
+          NumericT tmp = input[batch_id * stride + i];    // index
+          input[batch_id * stride + i] = input[batch_id * stride + v];//index
+          input[batch_id * stride + v] = tmp;//index
+        }
+        else
+        {
+          NumericT tmp = input[i * stride + batch_id];
+          input[i * stride + batch_id] = input[v * stride + batch_id];
+          input[v * stride + batch_id] = tmp;
+        }
+      }
+    }
+  }
+}
+
+/***
+ * This function reorders the input data: indices are permuted into bit-reversal order.
+ * Such reordering must be done before an in-place radix-2 FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV> & in,
+             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                           static_cast<unsigned int>(bits_datasize),
+                           static_cast<unsigned int>(size),
+                           static_cast<unsigned int>(stride),
+                           static_cast<unsigned int>(batch_num),
+                           static_cast<bool>(data_order));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2_local(Numeric2T * input,
+                                 unsigned int bit_size,
+                                 unsigned int size,
+                                 unsigned int stride,
+                                 unsigned int batch_num,
+                                 NumericT sign,
+                                 bool is_row_major)
+{
+  __shared__ Numeric2T lcl_input[1024];
+  unsigned int grp_id = blockIdx.x;
+  unsigned int grp_num = gridDim.x;
+
+  unsigned int lcl_sz = blockDim.x;
+  unsigned int lcl_id = threadIdx.x;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
+  {
+    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+    {
+      unsigned int v = get_reorder_num(p, bit_size);
+      if (!is_row_major)
+        lcl_input[v] = input[batch_id * stride + p];
+      else
+        lcl_input[v] = input[p * stride + batch_id];
+    }
+
+    __syncthreads();
+
+    // performs Cooley-Tukey FFT on the local array
+    for (unsigned int s = 0; s < bit_size; s++)
+    {
+      unsigned int ss = 1 << s;
+      NumericT cs, sn;
+      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
+      {
+        unsigned int group = (tid & (ss - 1));
+        unsigned int pos = ((tid >> s) << (s + 1)) + group;
+
+        Numeric2T in1 = lcl_input[pos];
+        Numeric2T in2 = lcl_input[pos + ss];
+
+        NumericT arg = group * sign * NUM_PI / ss;
+
+        sn = sin(arg);
+        cs = cos(arg);
+        Numeric2T ex;
+        ex.x = cs;
+        ex.y = sn;
+
+        Numeric2T tmp;
+        tmp.x = in2.x * ex.x - in2.y * ex.y;
+        tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+        lcl_input[pos + ss] = in1 - tmp;
+        lcl_input[pos]      = in1 + tmp;
+      }
+      __syncthreads();
+    }
+
+    //copy local array back to global memory
+    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+    {
+      if (!is_row_major)
+        input[batch_id * stride + p] = lcl_input[p];   //index
+      else
+        input[p * stride + batch_id] = lcl_input[p];
+    }
+
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2(Numeric2T * input,
+                           unsigned int s,
+                           unsigned int bit_size,
+                           unsigned int size,
+                           unsigned int stride,
+                           unsigned int batch_num,
+                           NumericT sign,
+                           bool is_row_major)
+{
+
+  unsigned int ss = 1 << s;
+  unsigned int half_size = size >> 1;
+
+  NumericT cs, sn;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
+    {
+      unsigned int group = (tid & (ss - 1));
+      unsigned int pos = ((tid >> s) << (s + 1)) + group;
+      Numeric2T in1;
+      Numeric2T in2;
+      unsigned int offset;
+      if (!is_row_major)
+      {
+        offset = batch_id * stride + pos;
+        in1 = input[offset];   //index
+        in2 = input[offset + ss];//index
+      }
+      else
+      {
+        offset = pos * stride + batch_id;
+        in1 = input[offset];   //index
+        in2 = input[offset + ss * stride];//index
+      }
+
+      NumericT arg = group * sign * NUM_PI / ss;
+
+      sn = sin(arg);
+      cs = cos(arg);
+
+      Numeric2T ex;
+      ex.x = cs;
+      ex.y = sn;
+
+      Numeric2T tmp;
+      tmp.x = in2.x * ex.x - in2.y * ex.y;
+      tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+      if (!is_row_major)
+        input[offset + ss] = in1 - tmp;  //index
+      else
+        input[offset + ss * stride] = in1 - tmp;  //index
+      input[offset] = in1 + tmp;  //index
+    }
+  }
+}
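+
+// Butterfly used by both radix-2 kernels above (editorial comment, not part of the upstream
+// ViennaCL sources): with twiddle w = exp(sign * i * pi * group / ss), the pair (in1, in2) is
+// replaced by (in1 + w*in2, in1 - w*in2). After bit_size such passes over bit-reversed data the
+// transform is complete.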
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV> & in,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                                  static_cast<unsigned int>(bit_size),
+                                  static_cast<unsigned int>(size),
+                                  static_cast<unsigned int>(stride),
+                                  static_cast<unsigned int>(batch_num),
+                                  static_cast<NumericT>(sign),
+                                  static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+  }
+  else
+  {
+    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             static_cast<unsigned int>(bit_size),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(stride),
+                             static_cast<unsigned int>(batch_num),
+                             static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+
+    for (vcl_size_t step = 0; step < bit_size; step++)
+    {
+      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                              static_cast<unsigned int>(step),
+                              static_cast<unsigned int>(bit_size),
+                              static_cast<unsigned int>(size),
+                              static_cast<unsigned int>(stride),
+                              static_cast<unsigned int>(batch_num),
+                              sign,
+                              static_cast<bool>(data_order));
+      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+    }
+  }
+}
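+
+// Note (editorial comment, not part of the upstream ViennaCL sources): for
+// size <= MAX_LOCAL_POINTS_NUM (512) the whole transform runs in a single shared-memory kernel
+// (fft_radix2_local); larger inputs are bit-reversal-reordered once and then processed by
+// log2(size) global-memory butterfly passes (fft_radix2).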
+
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                                  static_cast<unsigned int>(bit_size),
+                                  static_cast<unsigned int>(size),
+                                  static_cast<unsigned int>(stride),
+                                  static_cast<unsigned int>(batch_num),
+                                  sign,
+                                  static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+  }
+  else
+  {
+    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             static_cast<unsigned int>(bit_size),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(stride),
+                             static_cast<unsigned int>(batch_num),
+                             static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+    for (vcl_size_t step = 0; step < bit_size; step++)
+    {
+      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                              static_cast<unsigned int>(step),
+                              static_cast<unsigned int>(bit_size),
+                              static_cast<unsigned int>(size),
+                              static_cast<unsigned int>(stride),
+                              static_cast<unsigned int>(batch_num),
+                              sign,
+                              static_cast<bool>(data_order));
+      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+    }
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
+{
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  unsigned int double_size = size << 1;
+  NumericT sn_a, cs_a;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int i = glb_id; i < size; i += glb_sz)
+  {
+    unsigned int rm = i * i % (double_size);
+    NumericT angle = (NumericT)rm / size * (-NUM_PI);
+
+    sn_a = sin(angle);
+    cs_a = cos(angle);
+
+    Numeric2T b_i;
+    b_i.x = cs_a;
+    b_i.y = sn_a;
+    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
+    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
+                              unsigned int size, unsigned int ext_size, NumericT sign)
+{
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  unsigned int double_size = size << 1;
+
+  NumericT sn_a, cs_a;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int i = glb_id; i < size; i += glb_sz)
+  {
+    unsigned int rm = i * i % (double_size);
+    NumericT angle = (NumericT)rm / size * NUM_PI;
+
+    sn_a = sin(-angle);
+    cs_a = cos(-angle);
+
+    Numeric2T a_i;
+    a_i.x = cs_a;
+    a_i.y = sn_a;
+
+    Numeric2T b_i;
+    b_i.x = cs_a;
+    b_i.y = -sn_a;
+
+    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
+    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
+    B[i] = b_i;
+
+    // very bad instruction, to be fixed
+    if (i)
+      B[ext_size - i] = b_i;
+  }
+}
+
+template<typename NumericT>
+__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    input1[i].x = 0;
+    input1[i].y = 0;
+
+    input2[i].x = 0;
+    input2[i].y = 0;
+  }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes less than 2^16.
+ * Uses a considerable amount of additional memory, but should be fast for any data size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = in.size() >> 1;
+  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);
+
+  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+                     static_cast<unsigned int>(ext_size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
+
+  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(ext_size),
+                             NumericT(1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
+
+  viennacl::linalg::convolve_i(A, B, Z);
+
+  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
+                              reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+                              static_cast<unsigned int>(size),
+                              NumericT(1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
+}
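+
+// Background (editorial comment, not part of the upstream ViennaCL sources): Bluestein's
+// algorithm rewrites a length-N DFT as a convolution with the chirp b_n = exp(i*pi*n^2/N),
+// so only FFTs of a convenient padded length are needed. For example, N = 1000 gives
+// ext_size = next_power_2(2*1000 - 1) = 2048, so A, B and Z each hold 2048 complex entries.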
+
+template<typename NumericT>
+__global__ void fft_mult_vec(const NumericT * input1,
+                             const NumericT * input2,
+                             NumericT * output,
+                             unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    NumericT in1 = input1[i];
+    NumericT in2 = input2[i];
+    output[i] = in1 * in2;
+  }
+}
+
+/**
+ * @brief Multiply two complex vectors and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV> & output)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = input1.size() / 2;
+
+  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
+                            reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
+                            reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
+                            static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
+}
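+
+// Editorial note (not part of the upstream ViennaCL sources): the kernel applies the complex
+// product (a + bi)(c + di) = (ac - bd) + (ad + bc)i elementwise; 'size' is the number of
+// complex entries, i.e. half the number of stored scalars.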
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
+    input1[i] = input1[i]/factor;
+}
+
+/**
+ * @brief Normalize a vector by its own size (divide each complex entry by the number of complex elements)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = input.size() >> 1;
+  NumericT norm_factor = static_cast<NumericT>(size);
+  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+                                  static_cast<unsigned int>(size),
+                                  norm_factor);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
+}
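+
+// Editorial note (not part of the upstream ViennaCL sources): each of the 'size' complex
+// entries is divided by 'size', so a forward transform followed by a transform with
+// sign = +1 and a call to normalize() reproduces the original data.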
+
+template<typename NumericT>
+__global__ void transpose(const NumericT * input,
+                          NumericT * output,
+                          unsigned int row_num,
+                          unsigned int col_num)
+{
+  unsigned int size = row_num * col_num;
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    unsigned int row = i / col_num;
+    unsigned int col = i - row*col_num;
+    unsigned int new_pos = col * row_num + row;
+    output[new_pos] = input[i];
+  }
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
+                         reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
+                         static_cast<unsigned int>(input.internal_size1()>>1),
+                         static_cast<unsigned int>(input.internal_size2()>>1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
+
+}
+
+template<typename NumericT>
+__global__ void transpose_inplace(
+    NumericT * input,
+    unsigned int row_num,
+    unsigned int col_num)
+{
+  unsigned int size = row_num * col_num;
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
+  {
+    unsigned int row = i / col_num;
+    unsigned int col = i - row*col_num;
+    unsigned int new_pos = col * row_num + row;
+    if (i < new_pos)
+    {
+      NumericT val = input[i];
+      input[i] = input[new_pos];
+      input[new_pos] = val;
+    }
+  }
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+                                 static_cast<unsigned int>(input.internal_size1()>>1),
+                                 static_cast<unsigned int>(input.internal_size2() >> 1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
+
+}
+
+template<typename RealT,typename ComplexT>
+__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    ComplexT val;
+    val.x = in[i];
+    val.y = 0;
+    out[i] = val;
+  }
+}
+
+/**
+ * @brief Create a complex vector from a real vector (even elements (2*k) = real part, odd elements (2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
+                               reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+                               static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
+}
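+
+// Example (editorial comment, not part of the upstream ViennaCL sources): the real vector
+// [1, 2, 3] becomes the interleaved complex storage [1, 0, 2, 0, 3, 0], i.e. each real value
+// paired with a zero imaginary part.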
+
+template<typename ComplexT,typename RealT>
+__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+    out[i] = in[i].x;
+}
+
+/**
+ * @brief Create a real vector from a complex vector (even elements (2*k) = real part, odd elements (2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>& out, vcl_size_t size)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                               viennacl::cuda_arg(out),
+                               static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
+
+}
+
+template<typename NumericT>
+__global__ void reverse_inplace(NumericT * vec, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
+  {
+    NumericT val1 = vec[i];
+    NumericT val2 = vec[size - i - 1];
+    vec[i] = val2;
+    vec[size - i - 1] = val1;
+  }
+}
+
+/**
+ * @brief Reverse a vector in place (the result is stored in the input vector)
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+  vcl_size_t size = in.size();
+  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
+}
+
+}  //namespace cuda
+}  //namespace linalg
+}  //namespace viennacl
+
+#endif /* VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
new file mode 100644
index 0000000..302a73c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
@@ -0,0 +1,666 @@
+#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_L_kernel_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          unsigned int A_size1,
+          unsigned int * L_row_indices)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int num_entries_L = 0;
+    for (unsigned int j=row_begin; j<row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (col <= row)
+        ++num_entries_L;
+    }
+
+    L_row_indices[row] = num_entries_L;
+  }
+}
+
+template<typename NumericT>
+__global__ void extract_L_kernel_2(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          unsigned int const *L_row_indices,
+          unsigned int       *L_col_indices,
+          NumericT           *L_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int index_L = L_row_indices[row];
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      NumericT value = A_elements[j];
+
+      if (col <= row)
+      {
+        L_col_indices[index_L] = col;
+        L_elements[index_L]    = value;
+        ++index_L;
+      }
+    }
+  }
+}
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  //
+  // Step 1: Count elements in L and U:
+  //
+  extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle1())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  //
+  // Step 3: Write entries
+  //
+  extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");
+
+  L.generate_row_block_information();
+
+} // extract_L
+
+///////////////////////////////////////////////
+
+
+template<typename NumericT>
+__global__ void ilu_scale_kernel_1(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+}
+
+/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
+template<typename NumericT>
+__global__ void ilu_scale_kernel_2(
+          unsigned int const *R_row_indices,
+          unsigned int const *R_col_indices,
+          NumericT           *R_elements,
+          unsigned int R_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < R_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = R_row_indices[row];
+    unsigned int row_end   = R_row_indices[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = row_begin; j < row_end; ++j)
+      R_elements[j] *= D_row * D_elements[R_col_indices[j]];
+  }
+}
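+
+// Editorial note (not part of the upstream ViennaCL sources): ilu_scale_kernel_1 sets
+// D[i] = 1 / sqrt(|A[i][i]|) and ilu_scale_kernel_2 multiplies each entry R[i][j] by D[i]*D[j],
+// so the scaled matrix D*A*D has diagonal entries of magnitude one (e.g. A[i][i] = 4 gives
+// D[i] = 0.5 and a scaled diagonal entry of 4 * 0.5 * 0.5 = 1).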
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values from A in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  // fill D:
+  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg(D)
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+  // scale L:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle()),
+                                   static_cast<unsigned int>(L.size1()),
+                                   viennacl::cuda_arg(D)
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ICC sweep */
+template<typename NumericT>
+__global__ void icc_chow_patel_sweep_kernel(
+          unsigned int const *L_row_indices,
+          unsigned int const *L_col_indices,
+          NumericT           *L_elements,
+          NumericT     const *L_backup,
+          unsigned int L_size1,
+          NumericT     const *aij_L)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < L_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_Li_start = L_row_indices[row];
+    unsigned int row_Li_end   = L_row_indices[row + 1];
+
+    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+    {
+      unsigned int col = L_col_indices[i];
+
+      unsigned int row_Lj_start = L_row_indices[col];
+      unsigned int row_Lj_end   = L_row_indices[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_Lj = row_Lj_start;
+      unsigned int col_Lj = L_col_indices[index_Lj];
+      NumericT s = aij_L[i];
+      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+      {
+        unsigned int col_Li = L_col_indices[index_Li];
+
+        // find element in U
+        while (col_Lj < col_Li)
+        {
+          ++index_Lj;
+          col_Lj = L_col_indices[index_Lj];
+        }
+
+        if (col_Lj == col_Li)
+          s -= L_backup[index_Li] * L_backup[index_Lj];
+      }
+
+      // update l_ij:
+      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]);  // diagonal element is last entry in U
+    }
+
+  }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel ICC factorization using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L)
+{
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                            viennacl::cuda_arg<NumericT>(L.handle()),
+                                            viennacl::cuda_arg<NumericT>(L_backup),
+                                            static_cast<unsigned int>(L.size1()),
+
+                                            viennacl::cuda_arg<NumericT>(aij_L.handle())
+                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");
+
+}
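+
+// Editorial note (not part of the upstream ViennaCL sources): a typical calling sequence is
+// extract_L(), then icc_scale(), then several icc_chow_patel_sweep() iterations. Each sweep
+// reads from the snapshot L_backup and writes to L, so the updates within one sweep are
+// independent of their order (a Jacobi-style fixed-point iteration).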
+
+
+////////////////////////////// ILU ///////////////////////////
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_LU_kernel_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          unsigned int A_size1,
+
+          unsigned int * L_row_indices,
+
+          unsigned int * U_row_indices)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int num_entries_L = 0;
+    unsigned int num_entries_U = 0;
+    for (unsigned int j=row_begin; j<row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (col <= row)
+        ++num_entries_L;
+      if (col >= row)
+        ++num_entries_U;
+    }
+
+    L_row_indices[row] = num_entries_L;
+    U_row_indices[row] = num_entries_U;
+  }
+}
+
+template<typename NumericT>
+__global__ void extract_LU_kernel_2(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          unsigned int const *L_row_indices,
+          unsigned int       *L_col_indices,
+          NumericT           *L_elements,
+
+          unsigned int const *U_row_indices,
+          unsigned int       *U_col_indices,
+          NumericT           *U_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int index_L = L_row_indices[row];
+    unsigned int index_U = U_row_indices[row];
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      NumericT value = A_elements[j];
+
+      if (col <= row)
+      {
+        L_col_indices[index_L] = col;
+        L_elements[index_L]    = value;
+        ++index_L;
+      }
+
+      if (col >= row)
+      {
+        U_col_indices[index_U] = col;
+        U_elements[index_U]    = value;
+        ++index_U;
+      }
+    }
+  }
+}
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  //
+  // Step 1: Count elements in L and U:
+  //
+  extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                    static_cast<unsigned int>(A.size1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+  U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+  //
+  // Step 3: Write entries
+  //
+  extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                    viennacl::cuda_arg<NumericT>(A.handle()),
+                                    static_cast<unsigned int>(A.size1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                    viennacl::cuda_arg<NumericT>(L.handle()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle2()),
+                                    viennacl::cuda_arg<NumericT>(U.handle())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");
+
+  L.generate_row_block_information();
+  // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  // fill D:
+  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+  // scale L:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle()),
+                                   static_cast<unsigned int>(L.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+
+  // scale U:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(U.handle2()),
+                                   viennacl::cuda_arg<NumericT>(U.handle()),
+                                   static_cast<unsigned int>(U.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ILU sweep */
+template<typename NumericT>
+__global__ void ilu_chow_patel_sweep_kernel(
+          unsigned int const *L_row_indices,
+          unsigned int const *L_col_indices,
+          NumericT           *L_elements,
+          NumericT     const *L_backup,
+          unsigned int L_size1,
+
+          NumericT     const *aij_L,
+
+          unsigned int const *U_trans_row_indices,
+          unsigned int const *U_trans_col_indices,
+          NumericT           *U_trans_elements,
+          NumericT     const *U_trans_backup,
+
+          NumericT     const *aij_U_trans)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < L_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_L_start = L_row_indices[row];
+    unsigned int row_L_end   = L_row_indices[row + 1];
+
+    for (unsigned int j = row_L_start; j < row_L_end; ++j)
+    {
+      unsigned int col = L_col_indices[j];
+
+      if (col == row)
+        continue;
+
+      unsigned int row_U_start = U_trans_row_indices[col];
+      unsigned int row_U_end   = U_trans_row_indices[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_U = row_U_start;
+      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
+      NumericT sum = 0;
+      for (unsigned int k = row_L_start; k < j; ++k)
+      {
+        unsigned int col_L = L_col_indices[k];
+
+        // find element in U
+        while (col_U < col_L)
+        {
+          ++index_U;
+          col_U = U_trans_col_indices[index_U];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[k] * U_trans_backup[index_U];
+      }
+
+      // update l_ij:
+      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1];  // diagonal element is last entry in U
+    }
+
+
+    //
+    // update U:
+    //
+    unsigned int row_U_start = U_trans_row_indices[row];
+    unsigned int row_U_end   = U_trans_row_indices[row + 1];
+    for (unsigned int j = row_U_start; j < row_U_end; ++j)
+    {
+      unsigned int col = U_trans_col_indices[j];
+
+      row_L_start = L_row_indices[col];
+      row_L_end   = L_row_indices[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_L = row_L_start;
+      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
+      NumericT sum = 0;
+      for (unsigned int k = row_U_start; k < j; ++k)
+      {
+        unsigned int col_U = U_trans_col_indices[k];
+
+        // find element in L
+        while (col_L < col_U)
+        {
+          ++index_L;
+          col_L = L_col_indices[index_L];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[index_L] * U_trans_backup[k];
+      }
+
+      // update u_ij:
+      U_trans_elements[j] = aij_U_trans[j] - sum;
+    }
+  }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel ILU factorization using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  viennacl::backend::mem_handle U_backup;
+  viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+  viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+  ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                            viennacl::cuda_arg<NumericT>(L.handle()),
+                                            viennacl::cuda_arg<NumericT>(L_backup),
+                                            static_cast<unsigned int>(L.size1()),
+
+                                            viennacl::cuda_arg<NumericT>(aij_L.handle()),
+
+                                            viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
+                                            viennacl::cuda_arg<NumericT>(U_trans.handle()),
+                                            viennacl::cuda_arg<NumericT>(U_backup),
+
+                                            viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
+                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");
+
+}
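+
+// Editorial note (not part of the upstream ViennaCL sources): the sweep performs the
+// Jacobi-style fixed-point updates l_ij = (a_ij - sum_k l_ik u_kj) / u_jj and
+// u_ij = a_ij - sum_k l_ik u_kj on snapshots (L_backup, U_backup) of the previous iterate;
+// U is passed in transposed form, with the diagonal entry stored last in each row of U^T.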
+
+//////////////////////////////////////
+
+template<typename NumericT>
+__global__ void ilu_form_neumann_matrix_kernel(
+          unsigned int const *R_row_indices,
+          unsigned int const *R_col_indices,
+          NumericT           *R_elements,
+          unsigned int R_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < R_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = R_row_indices[row];
+    unsigned int row_end   = R_row_indices[row+1];
+
+    // part 1: extract diagonal entry
+    NumericT diag = 0;
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = R_col_indices[j];
+      if (col == row)
+      {
+        diag = R_elements[j];
+        R_elements[j] = 0; // (I - D^{-1}R)
+        break;
+      }
+    }
+    D_elements[row] = diag;
+
+    // part2: scale
+    for (unsigned int j = row_begin; j < row_end; ++j)
+      R_elements[j] /= -diag;
+  }
+}
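+
+// Editorial note (not part of the upstream ViennaCL sources): after this kernel D holds the
+// original diagonal of R and R holds M = I - D^{-1} R (zero diagonal), so that
+// R^{-1} = (I - M)^{-1} D^{-1} can be approximated by the truncated Neumann series
+// (I + M + M^2 + ...) D^{-1}.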
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
+                                               viennacl::cuda_arg<unsigned int>(R.handle2()),
+                                               viennacl::cuda_arg<NumericT>(R.handle()),
+                                               static_cast<unsigned int>(R.size1()),
+                                               viennacl::cuda_arg<NumericT>(diag_R.handle())
+                                              );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
+}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif