You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/10 16:52:27 UTC
[22/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
deleted file mode 100644
index 1d212c2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
+++ /dev/null
@@ -1,307 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
-#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/direct_solve.hpp
-    @brief Implementations of dense direct triangular solvers are found here.
-*/
-
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-
-#include "viennacl/linalg/host_based/common.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-namespace detail
-{
-  //
-  // Upper solve:
-  //
-  /** @brief Back substitution for an upper triangular system with several right hand sides; B is overwritten with the solution.
-  *
-  * Rows are handled from the last one up to the first one. Only entries of A on and above the diagonal are read.
-  *
-  * @param A              System matrix, accessed as A(row, column)
-  * @param B              Right hand sides on entry, solution on exit; accessed as B(row, column)
-  * @param A_size         Number of rows (and columns) of A that are processed
-  * @param B_size         Number of columns of B that are processed
-  * @param unit_diagonal  If true, the diagonal of A is treated as one and never read
-  */
-  template<typename MatrixT1, typename MatrixT2>
-  void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
-  {
-    typedef typename MatrixT2::value_type   value_type;
-
-    // 'row' runs A_size-1, A_size-2, ..., 0
-    for (vcl_size_t row = A_size; row-- > 0; )
-    {
-      // eliminate the already computed unknowns to the right of the diagonal
-      for (vcl_size_t col = row + 1; col < A_size; ++col)
-      {
-        value_type coeff = A(row, col);
-        for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-          B(row, rhs) -= coeff * B(col, rhs);
-      }
-
-      if (unit_diagonal)
-        continue;
-
-      value_type pivot = A(row, row);
-      for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-        B(row, rhs) /= pivot;
-    }
-  }
-
-  /** @brief Tag dispatch: upper triangular solve with an implicit unit diagonal. */
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
-  {
-    upper_inplace_solve_matrix(A, B, A_size, B_size, true);
-  }
-
-  /** @brief Tag dispatch: upper triangular solve using the stored diagonal of A. */
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
-  {
-    upper_inplace_solve_matrix(A, B, A_size, B_size, false);
-  }
-
-  //
-  // Lower solve:
-  //
-  /** @brief Forward substitution for a lower triangular system with several right hand sides; B is overwritten with the solution.
-  *
-  * Rows are handled from the first one down to the last one. Only entries of A on and below the diagonal are read.
-  *
-  * @param A              System matrix, accessed as A(row, column)
-  * @param B              Right hand sides on entry, solution on exit; accessed as B(row, column)
-  * @param A_size         Number of rows (and columns) of A that are processed
-  * @param B_size         Number of columns of B that are processed
-  * @param unit_diagonal  If true, the diagonal of A is treated as one and never read
-  */
-  template<typename MatrixT1, typename MatrixT2>
-  void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
-  {
-    typedef typename MatrixT2::value_type   value_type;
-
-    for (vcl_size_t row = 0; row < A_size; ++row)
-    {
-      // eliminate the already computed unknowns to the left of the diagonal
-      for (vcl_size_t col = 0; col < row; ++col)
-      {
-        value_type coeff = A(row, col);
-        for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-          B(row, rhs) -= coeff * B(col, rhs);
-      }
-
-      if (unit_diagonal)
-        continue;
-
-      value_type pivot = A(row, row);
-      for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-        B(row, rhs) /= pivot;
-    }
-  }
-
-  /** @brief Tag dispatch: lower triangular solve with an implicit unit diagonal. */
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
-  {
-    lower_inplace_solve_matrix(A, B, A_size, B_size, true);
-  }
-
-  /** @brief Tag dispatch: lower triangular solve using the stored diagonal of A. */
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
-  {
-    lower_inplace_solve_matrix(A, B, A_size, B_size, false);
-  }
-
-}
-
-//
-// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
-//
-
-////////////////// triangular solver, multiple right hand sides //////////////////////////////////////
-/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
-*
-* Wraps the raw host buffers of A and B according to their storage layouts and forwards to
-* detail::inplace_solve_matrix(), which is selected by the solver tag type.
-*
-* @param A        The system matrix
-* @param B        The matrix of row vectors, where the solution is directly written to
-* (third, unnamed parameter)  Solver tag; only its type is used, a default-constructed SolverTagT is passed on
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_base<NumericT> const & A,
-                   matrix_base<NumericT> & B,
-                   SolverTagT)
-{
-  typedef NumericT        value_type;
-
-  typedef detail::matrix_array_wrapper<value_type const, row_major,    false>   RowMajorSystem;
-  typedef detail::matrix_array_wrapper<value_type const, column_major, false>   ColMajorSystem;
-  typedef detail::matrix_array_wrapper<value_type,       row_major,    false>   RowMajorRhs;
-  typedef detail::matrix_array_wrapper<value_type,       column_major, false>   ColMajorRhs;
-
-  value_type const * raw_A = detail::extract_raw_pointer<value_type>(A);
-  value_type       * raw_B = detail::extract_raw_pointer<value_type>(B);
-
-  // layout description of A
-  vcl_size_t A_row_start  = viennacl::traits::start1(A);
-  vcl_size_t A_col_start  = viennacl::traits::start2(A);
-  vcl_size_t A_row_stride = viennacl::traits::stride1(A);
-  vcl_size_t A_col_stride = viennacl::traits::stride2(A);
-  vcl_size_t A_cols       = viennacl::traits::size2(A);
-  vcl_size_t A_int_rows   = viennacl::traits::internal_size1(A);
-  vcl_size_t A_int_cols   = viennacl::traits::internal_size2(A);
-
-  // layout description of B
-  vcl_size_t B_row_start  = viennacl::traits::start1(B);
-  vcl_size_t B_col_start  = viennacl::traits::start2(B);
-  vcl_size_t B_row_stride = viennacl::traits::stride1(B);
-  vcl_size_t B_col_stride = viennacl::traits::stride2(B);
-  vcl_size_t B_cols       = viennacl::traits::size2(B);
-  vcl_size_t B_int_rows   = viennacl::traits::internal_size1(B);
-  vcl_size_t B_int_cols   = viennacl::traits::internal_size2(B);
-
-  // one instantiation per combination of storage layouts
-  if (A.row_major())
-  {
-    RowMajorSystem sys(raw_A, A_row_start, A_col_start, A_row_stride, A_col_stride, A_int_rows, A_int_cols);
-
-    if (B.row_major())
-    {
-      RowMajorRhs rhs(raw_B, B_row_start, B_col_start, B_row_stride, B_col_stride, B_int_rows, B_int_cols);
-      detail::inplace_solve_matrix(sys, rhs, A_cols, B_cols, SolverTagT());
-    }
-    else
-    {
-      ColMajorRhs rhs(raw_B, B_row_start, B_col_start, B_row_stride, B_col_stride, B_int_rows, B_int_cols);
-      detail::inplace_solve_matrix(sys, rhs, A_cols, B_cols, SolverTagT());
-    }
-  }
-  else
-  {
-    ColMajorSystem sys(raw_A, A_row_start, A_col_start, A_row_stride, A_col_stride, A_int_rows, A_int_cols);
-
-    if (B.row_major())
-    {
-      RowMajorRhs rhs(raw_B, B_row_start, B_col_start, B_row_stride, B_col_stride, B_int_rows, B_int_cols);
-      detail::inplace_solve_matrix(sys, rhs, A_cols, B_cols, SolverTagT());
-    }
-    else
-    {
-      ColMajorRhs rhs(raw_B, B_row_start, B_col_start, B_row_stride, B_col_stride, B_int_rows, B_int_cols);
-      detail::inplace_solve_matrix(sys, rhs, A_cols, B_cols, SolverTagT());
-    }
-  }
-}
-
-
-//
-//  Solve on vector
-//
-
-namespace detail
-{
-  //
-  // Upper solve:
-  //
-  /** @brief Back substitution for an upper triangular system with a single right hand side; b is overwritten with the solution.
-  *
-  * @param A              System matrix, accessed as A(row, column); only entries on and above the diagonal are read
-  * @param b              Right hand side on entry, solution on exit; accessed as b(index)
-  * @param A_size         Number of rows (and columns) of A that are processed
-  * @param unit_diagonal  If true, the diagonal of A is treated as one and never read
-  */
-  template<typename MatrixT, typename VectorT>
-  void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
-  {
-    typedef typename VectorT::value_type   value_type;
-
-    // 'row' runs A_size-1, A_size-2, ..., 0
-    for (vcl_size_t row = A_size; row-- > 0; )
-    {
-      for (vcl_size_t col = row + 1; col < A_size; ++col)
-      {
-        value_type coeff = A(row, col);
-        b(row) -= coeff * b(col);
-      }
-
-      if (unit_diagonal)
-        continue;
-
-      b(row) /= A(row, row);
-    }
-  }
-
-  /** @brief Tag dispatch: upper triangular solve with an implicit unit diagonal. */
-  template<typename MatrixT, typename VectorT>
-  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
-  {
-    upper_inplace_solve_vector(A, b, A_size, true);
-  }
-
-  /** @brief Tag dispatch: upper triangular solve using the stored diagonal of A. */
-  template<typename MatrixT, typename VectorT>
-  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
-  {
-    upper_inplace_solve_vector(A, b, A_size, false);
-  }
-
-  //
-  // Lower solve:
-  //
-  /** @brief Forward substitution for a lower triangular system with a single right hand side; b is overwritten with the solution.
-  *
-  * @param A              System matrix, accessed as A(row, column); only entries on and below the diagonal are read
-  * @param b              Right hand side on entry, solution on exit; accessed as b(index)
-  * @param A_size         Number of rows (and columns) of A that are processed
-  * @param unit_diagonal  If true, the diagonal of A is treated as one and never read
-  */
-  template<typename MatrixT, typename VectorT>
-  void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
-  {
-    typedef typename VectorT::value_type   value_type;
-
-    for (vcl_size_t row = 0; row < A_size; ++row)
-    {
-      for (vcl_size_t col = 0; col < row; ++col)
-      {
-        value_type coeff = A(row, col);
-        b(row) -= coeff * b(col);
-      }
-
-      if (unit_diagonal)
-        continue;
-
-      b(row) /= A(row, row);
-    }
-  }
-
-  /** @brief Tag dispatch: lower triangular solve with an implicit unit diagonal. */
-  template<typename MatrixT, typename VectorT>
-  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
-  {
-    lower_inplace_solve_vector(A, b, A_size, true);
-  }
-
-  /** @brief Tag dispatch: lower triangular solve using the stored diagonal of A. */
-  template<typename MatrixT, typename VectorT>
-  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
-  {
-    lower_inplace_solve_vector(A, b, A_size, false);
-  }
-
-}
-
-/** @brief Direct inplace solver for a triangular system with a single right hand side, i.e. mat \ vec   (MATLAB notation)
-*
-* Wraps the raw host buffers of the matrix and the vector and forwards to
-* detail::inplace_solve_vector(), which is selected by the solver tag type.
-*
-* @param mat      The system matrix
-* @param vec      The right hand side, where the solution is directly written to
-* (third, unnamed parameter)  Solver tag; only its type is used, a default-constructed SolverTagT is passed on
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_base<NumericT> const & mat,
-                   vector_base<NumericT> & vec,
-                   SolverTagT)
-{
-  typedef NumericT        value_type;
-
-  typedef detail::matrix_array_wrapper<value_type const, row_major,    false>   RowMajorSystem;
-  typedef detail::matrix_array_wrapper<value_type const, column_major, false>   ColMajorSystem;
-
-  value_type const * raw_mat = detail::extract_raw_pointer<value_type>(mat);
-  value_type       * raw_vec = detail::extract_raw_pointer<value_type>(vec);
-
-  // layout description of the matrix
-  vcl_size_t mat_row_start  = viennacl::traits::start1(mat);
-  vcl_size_t mat_col_start  = viennacl::traits::start2(mat);
-  vcl_size_t mat_row_stride = viennacl::traits::stride1(mat);
-  vcl_size_t mat_col_stride = viennacl::traits::stride2(mat);
-  vcl_size_t mat_cols       = viennacl::traits::size2(mat);
-  vcl_size_t mat_int_rows   = viennacl::traits::internal_size1(mat);
-  vcl_size_t mat_int_cols   = viennacl::traits::internal_size2(mat);
-
-  // layout description of the vector
-  vcl_size_t vec_start  = viennacl::traits::start(vec);
-  vcl_size_t vec_stride = viennacl::traits::stride(vec);
-
-  if (!mat.row_major())
-  {
-    ColMajorSystem sys(raw_mat, mat_row_start, mat_col_start, mat_row_stride, mat_col_stride, mat_int_rows, mat_int_cols);
-    detail::vector_array_wrapper<value_type> rhs(raw_vec, vec_start, vec_stride);
-
-    detail::inplace_solve_vector(sys, rhs, mat_cols, SolverTagT());
-  }
-  else
-  {
-    RowMajorSystem sys(raw_mat, mat_row_start, mat_col_start, mat_row_stride, mat_col_stride, mat_int_rows, mat_int_cols);
-    detail::vector_array_wrapper<value_type> rhs(raw_vec, vec_start, vec_stride);
-
-    detail::inplace_solve_vector(sys, rhs, mat_cols, SolverTagT());
-  }
-}
-
-
-} // namespace host_based
-} // namespace linalg
-} // namespace viennacl
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
deleted file mode 100644
index f53f8f2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
+++ /dev/null
@@ -1,856 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file  viennacl/linalg/host_based/fft_operations.hpp
-    @brief Implementations of the Fast Fourier Transform using a plain single-threaded or OpenMP-enabled execution on CPU
- */
-
-//TODO OpenMP conditions
-#include <viennacl/vector.hpp>
-#include <viennacl/matrix.hpp>
-
-#include "viennacl/linalg/host_based/vector_operations.hpp"
-
-#include <stdexcept>
-#include <cmath>
-#include <complex>
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-namespace detail
-{
-  namespace fft
-  {
-    /** @brief Largest transform size for which the radix2() drivers in this file use fft_radix2_local(). */
-    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
-
-    namespace FFT_DATA_ORDER
-    {
-      /** @brief Selects how the entries of one transform are addressed inside a batch buffer.
-       *
-       * With ROW_MAJOR (value 0) the kernels in this file index element n of batch b as b * stride + n,
-       * with COL_MAJOR as n * stride + b.
-       */
-      enum DATA_ORDER
-      {
-        ROW_MAJOR, COL_MAJOR
-      };
-    }
-
-    /** @brief Returns the smallest number of bits b such that (1 << b) >= size; the result is 0 for size 0 and size 1. */
-    inline vcl_size_t num_bits(vcl_size_t size)
-    {
-      vcl_size_t bits_datasize = 0;
-      vcl_size_t ds = 1;
-
-      while (ds < size)
-      {
-        ds = ds << 1;
-        bits_datasize++;
-      }
-
-      return bits_datasize;
-    }
-
-    /** @brief Rounds n up to a power of two (n itself if it already is one).
-     *
-     * All bits below the highest set bit of n - 1 are filled with ones, then one is added.
-     * NOTE(review): n == 0 relies on wrap-around of vcl_size_t and presumably yields 0 - confirm before relying on it.
-     */
-    inline vcl_size_t next_power_2(vcl_size_t n)
-    {
-      n = n - 1;
-
-      vcl_size_t power = 1;
-
-      while (power < sizeof(vcl_size_t) * 8)
-      {
-        n = n | (n >> power);
-        power *= 2;
-      }
-
-      return n + 1;
-    }
-
-    /** @brief Reverses the order of the lowest bit_size bits of v (bit-reversal permutation index).
-     *
-     * The lower 32 bits of v are reversed by swapping neighbouring bits, bit pairs, nibbles, bytes and
-     * finally the two 16-bit halves; the result is then shifted down so that only bit_size bits remain.
-     *
-     * @param v         Index to reverse; only its lower 32 bits are considered
-     * @param bit_size  Number of significant bits, expected to be at most 32
-     * @return          The bit-reversed index; 0 if bit_size is 0
-     */
-    inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
-    {
-      // Nothing to reverse. Also avoids shifting by the full 32 bits below, which is undefined if vcl_size_t is 32 bits wide.
-      if (bit_size == 0)
-        return 0;
-
-      v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-      v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-      v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-      v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-      // Mask both halves: if vcl_size_t is wider than 32 bits, an unmasked (v << 16) leaves bits above bit 31
-      // that would survive the final shift and corrupt the result for v >= 2^16.
-      v = ((v >> 16) & 0x0000FFFF) | ((v & 0x0000FFFF) << 16);
-      v = v >> (32 - bit_size);
-      return v;
-    }
-
-    /** @brief Packs 2 * size interleaved (real, imaginary) entries of a viennacl::vector into size complex numbers.
-     *
-     * @param input_complex  Destination array with room for size complex numbers
-     * @param in             Source vector; entries 2*j and 2*j+1 form complex number j
-     * @param size           Number of complex numbers to produce
-     */
-    template<typename NumericT, unsigned int AlignmentV>
-    void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                               viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
-      { //change array to complex array
-        vcl_size_t i = vcl_size_t(i2);
-        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
-      }
-    }
-
-    /** @brief Packs 2 * size interleaved (real, imaginary) entries of a viennacl::vector_base into size complex numbers.
-     *
-     * @param input_complex  Destination array with room for size complex numbers
-     * @param in             Source vector; entries 2*j and 2*j+1 form complex number j
-     * @param size           Number of complex numbers to produce
-     */
-    template<typename NumericT>
-    void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                               viennacl::vector_base<NumericT> const & in, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
-      { //change array to complex array
-        vcl_size_t i = vcl_size_t(i2);
-        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
-      }
-    }
-
-    /** @brief Unpacks size complex numbers into 2 * size interleaved (real, imaginary) entries of a viennacl::vector.
-     *
-     * @param input_complex  Source array holding size complex numbers
-     * @param in             Destination vector (despite its name); entries 2*j and 2*j+1 receive complex number j
-     * @param size           Number of complex numbers to write
-     */
-    template<typename NumericT, unsigned int AlignmentV>
-    void copy_to_vector(std::complex<NumericT> * input_complex,
-                        viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size); i2++)
-      {
-        vcl_size_t i = vcl_size_t(i2);
-        in(i * 2)     = static_cast<NumericT>(std::real(input_complex[i]));
-        in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i]));
-      }
-    }
-
-    /** @brief Packs 2 * size interleaved (real, imaginary) entries of a raw array into size complex numbers.
-     *
-     * @param input_complex  Destination array with room for size complex numbers
-     * @param in             Source array; entries 2*j and 2*j+1 form complex number j
-     * @param size           Number of complex numbers to produce
-     */
-    template<typename NumericT>
-    void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                               NumericT const * in, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
-      { //change array to complex array
-        vcl_size_t i = vcl_size_t(i2);
-        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
-      }
-    }
-
-    /** @brief Unpacks size complex numbers into 2 * size interleaved (real, imaginary) entries of a raw array.
-     *
-     * @param input_complex  Source array holding size complex numbers
-     * @param in             Destination array (despite its name); entries 2*j and 2*j+1 receive complex number j
-     * @param size           Number of complex numbers to write
-     */
-    template<typename NumericT>
-    void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size); i2++)
-      {
-        vcl_size_t i = vcl_size_t(i2);
-        in[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
-        in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
-      }
-    }
-
-    /** @brief Unpacks size complex numbers into a viennacl::vector_base as interleaved (real, imaginary) entries.
-     *
-     * The values are first collected in a temporary std::vector and then transferred with a single viennacl::copy().
-     *
-     * @param input_complex  Source array holding size complex numbers
-     * @param in             Destination vector (despite its name)
-     * @param size           Number of complex numbers to write
-     */
-    template<typename NumericT>
-    void copy_to_vector(std::complex<NumericT> * input_complex,
-                        viennacl::vector_base<NumericT> & in, vcl_size_t size)
-    {
-      std::vector<NumericT> temp(2 * size);
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size); i2++)
-      {
-        vcl_size_t i = vcl_size_t(i2);
-        temp[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
-        temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
-      }
-      viennacl::copy(temp, in);
-    }
-
-    /** @brief Sets the first size entries of both arrays to zero.
-     *
-     * @param input1  First array to clear
-     * @param input2  Second array to clear
-     * @param size    Number of entries cleared in each array
-     */
-    template<typename NumericT>
-    void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
-    {
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-      for (long i2 = 0; i2 < long(size); i2++)
-      {
-        vcl_size_t i = vcl_size_t(i2);
-        input1[i] = 0;
-        input2[i] = 0;
-      }
-    }
-
-  } //namespace fft
-
-} //namespace detail
-
-/**
- * @brief Direct algorithm kernel: evaluates the discrete Fourier sum for every output index of every batch.
- *
- * For each batch and each k in [0, size) it computes
- *   output[k] = sum over n in [0, size) of input[n] * exp(i * sign * 2 * pi * k * n / size).
- *
- * @param input_complex  Input samples of all batches
- * @param output         Receives the transformed values; addressed exactly like the input
- * @param size           Number of points per transform
- * @param stride         Distance used together with the batch index to address an element (see data_order)
- * @param batch_num      Number of transforms to compute
- * @param sign           Sign factor of the exponent
- * @param data_order     ROW_MAJOR: element n of batch b is at b * stride + n; otherwise at n * stride + b
- */
-template<typename NumericT>
-void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
-                vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
-                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-  // 'parallel for' distributes the batches over the threads; a bare 'parallel' would make every thread
-  // run the complete loop and write the same outputs concurrently.
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
-  {
-    vcl_size_t batch_id = vcl_size_t(batch_id2);
-    for (vcl_size_t k = 0; k < size; k += 1)
-    {
-      std::complex<NumericT> f = 0;
-      for (vcl_size_t n = 0; n < size; n++)
-      {
-        std::complex<NumericT> input;
-        if (!data_order)
-          input = input_complex[batch_id * stride + n]; //input index here
-        else
-          input = input_complex[n * stride + batch_id];
-        // phase = sign * 2*pi * k * n / size; dividing by (size * n) instead would divide by zero for n == 0
-        NumericT arg = sign * 2 * NUM_PI * NumericT(k) / NumericT(size) * NumericT(n);
-        NumericT sn  = std::sin(arg);
-        NumericT cs  = std::cos(arg);
-
-        // explicit complex product input * (cs + i * sn)
-        std::complex<NumericT> ex(cs, sn);
-        std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
-                                   input.real() * ex.imag() + input.imag() * ex.real());
-        f = f + tmp;
-      }
-      if (!data_order)
-        output[batch_id * stride + k] = f;   // output index here
-      else
-        output[k * stride + batch_id] = f;
-    }
-  }
-
-}
-
-/**
- * @brief Direct 1D algorithm for computing Fourier transformation.
- *
- * Packs the interleaved entries of 'in' into a temporary complex buffer, runs fft_direct()
- * and unpacks the result into 'out'.
- *
- * Works on any sizes of data.
- * Serial implementation has O(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::vector<NumericT, AlignmentV> const & in,
-            viennacl::vector<NumericT, AlignmentV>       & out,
-            vcl_size_t size, vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_helpers = viennacl::linalg::host_based::detail::fft;
-  typedef std::complex<NumericT>   complex_type;
-
-  std::vector<complex_type> samples(size * batch_num);
-  std::vector<complex_type> spectrum(size * batch_num);
-
-  fft_helpers::copy_to_complex_array(&samples[0], in, size * batch_num);
-
-  fft_direct(&samples[0], &spectrum[0], size, stride, batch_num, sign, data_order);
-
-  fft_helpers::copy_to_vector(&spectrum[0], out, size * batch_num);
-}
-
-/**
- * @brief Direct 2D algorithm for computing Fourier transformation.
- *
- * Packs the complete internal buffer of 'in' into a temporary complex buffer, runs fft_direct()
- * and unpacks the result into the internal buffer of 'out'.
- *
- * Works on any sizes of data.
- * Serial implementation has O(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
-            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out, vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_helpers = viennacl::linalg::host_based::detail::fft;
-  typedef std::complex<NumericT>   complex_type;
-
-  // two consecutive entries of a row form one complex number, hence half the internal column count
-  vcl_size_t complex_rows  = in.internal_size1();
-  vcl_size_t complex_cols  = in.internal_size2() >> 1;
-  vcl_size_t complex_count = complex_rows * complex_cols;
-
-  std::vector<complex_type> samples(complex_count);
-  std::vector<complex_type> spectrum(complex_count);
-
-  NumericT const * raw_in  = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * raw_out = detail::extract_raw_pointer<NumericT>(out);
-
-  fft_helpers::copy_to_complex_array(&samples[0], raw_in, complex_count);
-
-  fft_direct(&samples[0], &spectrum[0], size, stride, batch_num, sign, data_order);
-
-  fft_helpers::copy_to_vector(&spectrum[0], raw_out, complex_count);
-}
-
-/*
- * Reorders 1D input data in place: within every batch, element i is exchanged with the element
- * whose index is the bit-reversal of i (using bits_datasize bits).
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
-             vcl_size_t bits_datasize, vcl_size_t batch_num,
-             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_helpers = viennacl::linalg::host_based::detail::fft;
-
-  std::vector<std::complex<NumericT> > buffer(size * batch_num);
-  fft_helpers::copy_to_complex_array(&buffer[0], in, size * batch_num);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_signed = 0; batch_signed < long(batch_num); batch_signed++)
-  {
-    vcl_size_t batch = vcl_size_t(batch_signed);
-    for (vcl_size_t idx = 0; idx < size; idx++)
-    {
-      vcl_size_t reversed = fft_helpers::get_reorder_num(idx, bits_datasize);
-      if (idx >= reversed)   // each pair is exchanged only once; fixed points stay untouched
-        continue;
-
-      // positions of the two partners inside the buffer
-      vcl_size_t first;
-      vcl_size_t second;
-      if (!data_order)
-      {
-        first  = batch * stride + idx;
-        second = batch * stride + reversed;
-      }
-      else
-      {
-        first  = idx      * stride + batch;
-        second = reversed * stride + batch;
-      }
-
-      std::complex<NumericT> held = buffer[first];
-      buffer[first]  = buffer[second];
-      buffer[second] = held;
-    }
-  }
-  fft_helpers::copy_to_vector(&buffer[0], in, size * batch_num);
-}
-
-/*
- * Reorders 2D input data in place: within every batch, element i is exchanged with the element
- * whose index is the bit-reversal of i (using bits_datasize bits).
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
-             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
-             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_helpers = viennacl::linalg::host_based::detail::fft;
-
-  NumericT * raw = detail::extract_raw_pointer<NumericT>(in);
-
-  // two consecutive entries of a row form one complex number, hence half the internal column count
-  vcl_size_t complex_rows  = in.internal_size1();
-  vcl_size_t complex_cols  = in.internal_size2() >> 1;
-  vcl_size_t complex_count = complex_rows * complex_cols;
-
-  std::vector<std::complex<NumericT> > buffer(complex_count);
-
-  fft_helpers::copy_to_complex_array(&buffer[0], raw, complex_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_signed = 0; batch_signed < long(batch_num); batch_signed++)
-  {
-    vcl_size_t batch = vcl_size_t(batch_signed);
-    for (vcl_size_t idx = 0; idx < size; idx++)
-    {
-      vcl_size_t reversed = fft_helpers::get_reorder_num(idx, bits_datasize);
-      if (idx >= reversed)   // each pair is exchanged only once; fixed points stay untouched
-        continue;
-
-      // positions of the two partners inside the buffer
-      vcl_size_t first;
-      vcl_size_t second;
-      if (!data_order)
-      {
-        first  = batch * stride + idx;
-        second = batch * stride + reversed;
-      }
-      else
-      {
-        first  = idx      * stride + batch;
-        second = reversed * stride + batch;
-      }
-
-      std::complex<NumericT> held = buffer[first];
-      buffer[first]  = buffer[second];
-      buffer[second] = held;
-    }
-  }
-  fft_helpers::copy_to_vector(&buffer[0], raw, complex_count);
-}
-
-/**
- * @brief Radix-2 algorithm for computing Fourier transformation.
- *
- * Performs bit_size butterfly passes directly on input_complex. The data is not reordered here;
- * the radix2() driver in this file calls reorder() beforehand.
- * Kernel used by that driver for sizes above MAX_LOCAL_POINTS_NUM.
- */
-template<typename NumericT>
-void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num,
-                vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign,
-                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-  vcl_size_t const butterflies = size >> 1;   // number of butterflies per pass
-
-  for (vcl_size_t pass = 0; pass < bit_size; pass++)
-  {
-    vcl_size_t span = 1 << pass;              // index distance between the two butterfly inputs
-
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long batch_signed = 0; batch_signed < long(batch_num); batch_signed++)
-    {
-      vcl_size_t batch = vcl_size_t(batch_signed);
-      for (vcl_size_t bf = 0; bf < butterflies; bf++)
-      {
-        vcl_size_t twiddle_idx = (bf & (span - 1));
-        vcl_size_t first_idx   = ((bf >> pass) << (pass + 1)) + twiddle_idx;
-
-        // buffer positions of the two butterfly inputs
-        vcl_size_t upper;
-        vcl_size_t lower;
-        if (!data_order)
-        {
-          upper = batch * stride + first_idx;
-          lower = upper + span;
-        }
-        else
-        {
-          upper = first_idx * stride + batch;
-          lower = upper + span * stride;
-        }
-        std::complex<NumericT> a = input_complex[upper];
-        std::complex<NumericT> b = input_complex[lower];
-
-        NumericT angle = NumericT(twiddle_idx) * sign * NUM_PI / NumericT(span);
-        NumericT sine   = std::sin(angle);
-        NumericT cosine = std::cos(angle);
-        std::complex<NumericT> twiddle(cosine, sine);
-
-        // explicit complex product b * twiddle
-        std::complex<NumericT> rotated(b.real() * twiddle.real() - b.imag() * twiddle.imag(),
-                                       b.real() * twiddle.imag() + b.imag() * twiddle.real());
-
-        input_complex[lower] = a - rotated;
-        input_complex[upper] = a + rotated;
-      }
-    }
-  }
-
-}
-
-/**
- * @brief Radix-2 algorithm for computing Fourier transformation.
- *
- * For every batch, the data is copied in bit-reversed order into the scratch array lcl_input,
- * bit_size butterfly passes are run on the scratch array, and the result is copied back.
- * Kernel used by the radix2() driver in this file for sizes up to MAX_LOCAL_POINTS_NUM.
- */
-template<typename NumericT>
-void fft_radix2_local(std::complex<NumericT> * input_complex,
-                      std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size,
-                      vcl_size_t size, vcl_size_t stride, NumericT sign,
-                      viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-
-  for (vcl_size_t batch = 0; batch < batch_num; batch++)
-  {
-    // gather: scratch[bit-reversed(p)] = data[p]
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long src_signed = 0; src_signed < long(size); src_signed += 1)
-    {
-      vcl_size_t src = vcl_size_t(src_signed);
-      vcl_size_t dst = viennacl::linalg::host_based::detail::fft::get_reorder_num(src, bit_size);
-
-      if (!data_order)
-        lcl_input[dst] = input_complex[batch * stride + src]; //index
-      else
-        lcl_input[dst] = input_complex[src * stride + batch];
-    }
-
-    // butterfly passes on the scratch array
-    for (vcl_size_t pass = 0; pass < bit_size; pass++)
-    {
-      vcl_size_t span = 1 << pass;   // index distance between the two butterfly inputs
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for
-#endif
-      for (long bf_signed = 0; bf_signed < long(size)/2; bf_signed++)
-      {
-        vcl_size_t bf          = vcl_size_t(bf_signed);
-        vcl_size_t twiddle_idx = (bf & (span - 1));
-        vcl_size_t upper       = ((bf >> pass) << (pass + 1)) + twiddle_idx;
-        vcl_size_t lower       = upper + span;
-
-        std::complex<NumericT> a = lcl_input[upper];
-        std::complex<NumericT> b = lcl_input[lower];
-
-        NumericT angle = NumericT(twiddle_idx) * sign * NUM_PI / NumericT(span);
-
-        NumericT sine   = std::sin(angle);
-        NumericT cosine = std::cos(angle);
-        std::complex<NumericT> twiddle(cosine, sine);
-
-        // explicit complex product b * twiddle
-        std::complex<NumericT> rotated(b.real() * twiddle.real() - b.imag() * twiddle.imag(),
-                                       b.real() * twiddle.imag() + b.imag() * twiddle.real());
-
-        lcl_input[lower] = a - rotated;
-        lcl_input[upper] = a + rotated;
-      }
-
-    }
-
-    // scatter: copy the scratch array back to the batch's data
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long out_signed = 0; out_signed < long(size); out_signed += 1)
-    {
-      vcl_size_t out = vcl_size_t(out_signed);
-      if (!data_order)
-        input_complex[batch * stride + out] = lcl_input[out];
-      else
-        input_complex[out * stride + batch] = lcl_input[out];
-
-    }
-
-  }
-
-}
-
-/**
- * @brief Radix-2 1D algorithm for computing Fourier transformation.
- *
- * Sizes up to MAX_LOCAL_POINTS_NUM are handled by fft_radix2_local(); larger sizes are first
- * bit-reversed with reorder() and then processed by fft_radix2(). The result overwrites 'in'.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has O(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_helpers = viennacl::linalg::host_based::detail::fft;
-  typedef std::complex<NumericT>   complex_type;
-
-  vcl_size_t passes = fft_helpers::num_bits(size);
-
-  std::vector<complex_type> work(size * batch_num);
-  std::vector<complex_type> scratch(size * batch_num);
-  fft_helpers::copy_to_complex_array(&work[0], in, size * batch_num);
-
-  if (size > fft_helpers::MAX_LOCAL_POINTS_NUM)
-  {
-    // reorder() permutes 'in' itself, so the working copy has to be refreshed afterwards
-    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, passes, batch_num, data_order);
-    fft_helpers::copy_to_complex_array(&work[0], in, size * batch_num);
-    viennacl::linalg::host_based::fft_radix2(&work[0], batch_num, passes, size, stride, sign, data_order);
-  }
-  else
-  {
-    viennacl::linalg::host_based::fft_radix2_local(&work[0], &scratch[0], batch_num, passes, size, stride, sign, data_order);
-  }
-
-  fft_helpers::copy_to_vector(&work[0], in, size * batch_num);
-}
-
-/**
- * @brief Radix-2 (Cooley-Tukey) 2D Fourier transformation working on the raw storage of the row-major matrix 'in'; the result is written back to 'in'.
- *
- * Works only on power-of-two sizes of data; a serial implementation has O(n log n) complexity.
- * The storage is viewed as internal_size1() x (internal_size2() / 2) complex values, i.e. two scalars per complex entry.
- * 'size', 'stride', 'batch_num', 'sign' (default -1) and 'data_order' are forwarded unchanged to the reorder/butterfly helpers.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);  // presumably log2(size) - confirm against num_bits()
-
-  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
-
-  vcl_size_t row_num = in.internal_size1();
-  vcl_size_t col_num = in.internal_size2() >> 1;  // two scalars per complex column
-  vcl_size_t size_mat = row_num * col_num;  // number of complex entries in the (padded) storage
-
-  std::vector<std::complex<NumericT> > input_complex(size_mat);
-
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
-  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)  // small transforms: one call to the 'local' kernel
-  {
-    //std::cout<<bit_size<<","<<size<<","<<stride<<","<<batch_num<<","<<size<<","<<sign<<","<<data_order<<std::endl;
-    std::vector<std::complex<NumericT> > lcl_input(size_mat);  // scratch buffer for fft_radix2_local
-    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-  else
-  {
-    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);  // reorder works on 'in' itself ...
-    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);  // ... so the complex copy is refreshed afterwards
-    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat);  // write the result back into the matrix storage
-
-}
-
-/**
- * @brief Bluestein's algorithm for computing the Fourier transformation of 'in'; the result is written to 'out'.
- *
- * Currently works only for input sizes smaller than 2^16. The batch_num argument is ignored.
- * Uses a lot of additional memory (three work vectors padded to a power of two), but should be fast for any size of data.
- * A serial implementation has roughly O(n log n) complexity.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
-{
-
-  vcl_size_t size = in.size() >> 1;  // number of complex points: two scalars of 'in' per point
-  vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1);  // padded length used for the convolution
-
-  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
-  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
-  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
-
-  std::vector<std::complex<NumericT> > input_complex(size);
-  std::vector<std::complex<NumericT> > output_complex(size);
-
-  std::vector<std::complex<NumericT> > A_complex(ext_size);
-  std::vector<std::complex<NumericT> > B_complex(ext_size);
-  std::vector<std::complex<NumericT> > Z_complex(ext_size);
-
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < long(ext_size); i2++)  // zero the padded sequences
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    A_complex[i] = 0;
-    B_complex[i] = 0;
-  }
-
-  vcl_size_t double_size = size << 1;
-
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < long(size); i2++)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    vcl_size_t rm = i * i % (double_size);  // i^2 reduced modulo 2 * size
-    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI);  // pi * i^2 / size (up to the modulo reduction)
-
-    NumericT sn_a = std::sin(-angle);
-    NumericT cs_a = std::cos(-angle);
-
-    std::complex<NumericT> a_i(cs_a, sn_a);   // cos(angle) - i sin(angle)
-    std::complex<NumericT> b_i(cs_a, -sn_a);  // complex conjugate of a_i
-
-    A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
-                                          input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());  // input[i] * a_i
-    B_complex[i] = b_i;
-
-    // mirror b_i to index ext_size - i (skipped for i == 0); original author's note: very bad instruction, to be fixed
-    if (i)
-      B_complex[ext_size - i] = b_i;
-  }
-
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size);  // writes the unmodified input_complex back into 'in'
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size);
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size);
-
-  viennacl::linalg::convolve_i(A, B, Z);  // Z receives the convolution result of A and B
-
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < long(size); i2++)  // only the first 'size' convolution entries are used
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    vcl_size_t rm = i * i % (double_size);  // same reduced i^2 as in the setup loop
-    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI);
-    NumericT sn_a = std::sin(angle);
-    NumericT cs_a = std::cos(angle);
-    std::complex<NumericT> b_i(cs_a, sn_a);  // same value as a_i in the setup loop
-    output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
-                                               Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());  // Z[i] * b_i
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size);
-
-}
-
-/**
- * @brief Divides the entries of the vector by half of its length (input.size() / 2)
- *
- * Exactly 2 * (input.size() / 2) leading entries are scaled.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void normalize(viennacl::vector<NumericT, AlignmentV> & input)
-{
-  vcl_size_t const half_length = input.size() / 2;
-  vcl_size_t const entry_count = half_length * 2;
-  NumericT   const divisor     = static_cast<NumericT>(half_length);
-
-  for (vcl_size_t idx = 0; idx != entry_count; ++idx)
-    input[idx] /= divisor;
-}
-
-/**
- * @brief Element-wise complex multiplication of two vectors: output[k] = input1[k] * input2[k]
- *
- * The number of complex entries is taken from input1 (input1.size() / 2).
- */
-template<typename NumericT, unsigned int AlignmentV>
-void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
-                      viennacl::vector<NumericT, AlignmentV> const & input2,
-                      viennacl::vector<NumericT, AlignmentV> & output)
-{
-  typedef std::complex<NumericT> ComplexT;
-
-  vcl_size_t const count = input1.size() / 2;
-
-  std::vector<ComplexT> lhs(count);
-  std::vector<ComplexT> rhs(count);
-  std::vector<ComplexT> product(count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&lhs[0], input1, count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&rhs[0], input2, count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long k = 0; k < long(count); ++k)
-  {
-    vcl_size_t const idx = vcl_size_t(k);
-    ComplexT const a = lhs[idx];
-    ComplexT const b = rhs[idx];
-
-    // product written out by hand: (a_re + i a_im) * (b_re + i b_im)
-    NumericT const re = a.real() * b.real() - a.imag() * b.imag();
-    NumericT const im = a.real() * b.imag() + a.imag() * b.real();
-    product[idx] = ComplexT(re, im);
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&product[0], output, count);
-}
-/**
- * @brief In-place transpose of the complex entries stored in the matrix
- *
- * The raw storage is viewed as (internal_size1() / 2) x (internal_size2() / 2) complex values.
- * The entry at linear position i = row * col_num + col is swapped with the entry at col * row_num + row.
- * NOTE(review): such a pairwise swap is only a transpose when row_num == col_num - confirm that callers pass square data.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
-{
-  vcl_size_t row_num = input.internal_size1() / 2;
-  vcl_size_t col_num = input.internal_size2() / 2;
-
-  vcl_size_t size = row_num * col_num;  // number of complex entries handled
-
-  NumericT * data = detail::extract_raw_pointer<NumericT>(input);
-
-  std::vector<std::complex<NumericT> > input_complex(size);
-
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < long(size); i2++)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    vcl_size_t row = i / col_num;
-    vcl_size_t col = i - row * col_num;  // same as i % col_num
-    vcl_size_t new_pos = col * row_num + row;
-
-    if (i < new_pos)  // swap each pair once, from its lower index; entries with i == new_pos stay in place
-    {
-      std::complex<NumericT> val = input_complex[i];
-      input_complex[i] = input_complex[new_pos];
-      input_complex[new_pos] = val;
-    }
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size);  // write the result back into the matrix storage
-
-}
-
-/**
- * @brief Out-of-place transpose: writes the transposed complex entries of 'input' to 'output'
- *
- * The raw storage of 'input' is viewed as (internal_size1() / 2) x (internal_size2() / 2) complex values;
- * the value at linear position r * cols + c is stored at linear position c * rows + r of 'output'.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
-               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & output)
-{
-  typedef std::complex<NumericT> ComplexT;
-
-  vcl_size_t const rows  = input.internal_size1() / 2;
-  vcl_size_t const cols  = input.internal_size2() / 2;
-  vcl_size_t const count = rows * cols;
-
-  NumericT const * src_raw = detail::extract_raw_pointer<NumericT>(input);
-  NumericT       * dst_raw = detail::extract_raw_pointer<NumericT>(output);
-
-  std::vector<ComplexT> source(count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&source[0], src_raw, count);
-
-  std::vector<ComplexT> transposed(count);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long k = 0; k < long(count); ++k)
-  {
-    vcl_size_t const pos = vcl_size_t(k);
-    vcl_size_t const r   = pos / cols;
-    vcl_size_t const c   = pos - r * cols;
-    transposed[c * rows + r] = source[pos];
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&transposed[0], dst_raw, count);
-}
-
-/**
- * @brief Expands a real vector into interleaved complex storage: out[2*k] = in[k] (real part), out[2*k+1] = 0 (imaginary part)
- *
- * Processes the first 'size' entries of 'in' and writes 2 * size entries of 'out'.
- */
-template<typename NumericT>
-void real_to_complex(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
-{
-  NumericT const * src = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * dst = detail::extract_raw_pointer<NumericT>(out);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long k = 0; k < long(size); ++k)
-  {
-    vcl_size_t const idx = static_cast<vcl_size_t>(k);
-    NumericT * const slot = dst + 2 * idx;  // start of the (re, im) pair for entry idx
-    slot[0] = src[idx];
-    slot[1] = NumericT(0);
-  }
-}
-
-/**
- * @brief Extracts the real parts from interleaved complex storage: out[k] = in[2*k] (the imaginary parts in[2*k+1] are dropped)
- *
- * Writes the first 'size' entries of 'out'.
- */
-template<typename NumericT>
-void complex_to_real(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
-{
-  NumericT const * src = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * dst = detail::extract_raw_pointer<NumericT>(out);
-
-#ifdef VIENNACL_WITH_OPENMP
-#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long k = 0; k < long(size); ++k)
-  {
-    NumericT const real_part = src[2 * k];
-    dst[k] = real_part;
-  }
-}
-
-/**
- * @brief Reverses the order of the entries of the vector in place
- *
- * Only the first half of the index range is traversed: each iteration swaps entry i with its
- * mirror entry size - 1 - i. Traversing the full range would swap every pair twice (which
- * restores the original order) and would let two OpenMP iterations write to the same entries.
- */
-template<typename NumericT>
-void reverse(viennacl::vector_base<NumericT> & in)
-{
-  vcl_size_t size = in.size();
-  vcl_size_t half = size / 2;  // the middle entry of an odd-sized vector stays where it is
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i2 = 0; i2 < long(half); i2++)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    NumericT val1 = in[i];
-    NumericT val2 = in[size - i - 1];
-    in[i] = val2;
-    in[size - i - 1] = val1;
-  }
-}
-
-}      //namespace host_based
-}      //namespace linalg
-}      //namespace viennacl
-
-#endif /* FFT_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
deleted file mode 100644
index 62c885a..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
+++ /dev/null
@@ -1,672 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/ilu_operations.hpp
-    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP)
-*/
-
-#include <cmath>
-#include <algorithm>  //for std::max and std::min
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/linalg/host_based/common.hpp"
-#include "viennacl/linalg/vector_operations.hpp"
-#include "viennacl/traits/stride.hpp"
-
-
-// Minimum size (number of rows or nonzeros) for using OpenMP in the ILU/ICC routines of this file:
-#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE
-  #define VIENNACL_OPENMP_ILU_MIN_SIZE  5000
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-template<typename NumericT>
-void extract_L(compressed_matrix<NumericT> const & A,   // source matrix, only read
-               compressed_matrix<NumericT>       & L)   // receives all entries of A with col <= row
-{
-  // Copies the lower triangle of A (diagonal included) into L. L is known to have correct dimensions
-
-  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-
-  //
-  // Step 1: Count elements in L (the per-row counts are added onto the entries of L's row buffer)
-  // NOTE(review): assumes L's row buffer holds zeros on entry - confirm with the caller
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      if (long(col) <= row)  // on or below the diagonal
-        ++L_row_buffer[row];
-    }
-  }
-
-  //
-  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
-  // (after the scan the last entry, index L.size1(), is the number of entries reserved for L)
-  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
-  L.reserve(wrapped_L_row_buffer[L.size1()], false);
-
-  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());  // fetched after reserve(); presumably reserve() (re)allocates these buffers
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  //
-  // Step 3: Write entries (each row fills its own index range, starting at L_row_buffer[row]):
-  //
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    unsigned int index_L = L_row_buffer[row];  // next write position within L for this row
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      NumericT value = A_elements[j];
-
-      if (long(col) <= row)
-      {
-        L_col_buffer[index_L] = col;
-        L_elements[index_L]   = value;
-        ++index_L;
-      }
-    }
-  }
-
-} // extract_L
-
-
-/** @brief Scales the values in L (extracted from A) such that A' = DAD has a diagonal of magnitude one, where D = diag(1 / sqrt(|a_ii|)). Only L is updated; L's row block information is regenerated at the end. */
-template<typename NumericT>
-void icc_scale(compressed_matrix<NumericT> const & A,
-               compressed_matrix<NumericT>       & L)
-{
-  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));  // one scaling factor per row of A
-
-  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
-
-  //
-  // Step 1: Determine D (rows without a stored diagonal entry leave D_elements[row] untouched)
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      if (row == col)
-      {
-        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));  // 1 / sqrt(|a_ii|)
-        break;
-      }
-    }
-  }
-
-  //
-  // Step 2: Scale values in L (entry (row, col) is multiplied by D[row] * D[col]):
-  //
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = L_row_buffer[row];
-    unsigned int col_end   = L_row_buffer[row+1];
-
-    NumericT D_row = D_elements[row];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
-  }
-
-  L.generate_row_block_information();
-}
-
-
-
-/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U). All updates read the values L had on entry (kept in a backup copy) and write the new values to L. */
-template<typename NumericT>
-void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,       // factor to be updated in place
-                          vector<NumericT>            & aij_L)   // reference values a_ij, indexed like the nonzeros of L
-{
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  NumericT           *aij_ptr   = detail::extract_raw_pointer<NumericT>(aij_L.handle());
-
-  // temporary workspace (std::vector instead of an unchecked malloc: allocation failure throws, storage is released automatically)
-  std::vector<NumericT> L_backup(L.nnz());
-
-  // backup:
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
-    L_backup[i] = L_elements[i];
-
-
-  // sweep
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
-  {
-    //
-    // update L (each row only writes its own entries and only reads from L_backup):
-    //
-    unsigned int row_Li_start = L_row_buffer[row];
-    unsigned int row_Li_end   = L_row_buffer[row + 1];
-
-    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
-    {
-      unsigned int col = L_col_buffer[i];
-
-      unsigned int row_Lj_start = L_row_buffer[col];
-      unsigned int row_Lj_end   = L_row_buffer[col+1];
-
-      // compute \sum_{k=1}^{j-1} l_ik l_jk
-      unsigned int index_Lj = row_Lj_start;
-      unsigned int col_Lj = L_col_buffer[index_Lj];
-      NumericT s = aij_ptr[i];
-      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
-      {
-        unsigned int col_Li = L_col_buffer[index_Li];
-
-        // find element in row j (relies on ascending column indices within each row)
-        while (col_Lj < col_Li)
-        {
-          ++index_Lj;
-          col_Lj = L_col_buffer[index_Lj];
-        }
-
-        if (col_Lj == col_Li)
-          s -= L_backup[index_Li] * L_backup[index_Lj];
-      }
-
-      if (row != col)
-        L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row!
-      else
-        L_elements[i] = std::sqrt(s);
-    }
-  }
-
-  // no explicit cleanup needed: L_backup releases its storage when it goes out of scope
-}
-
-
-
-//////////////////////// ILU ////////////////////////
-
-template<typename NumericT>
-void extract_LU(compressed_matrix<NumericT> const & A,   // source matrix, only read
-                compressed_matrix<NumericT>       & L,   // receives all entries of A with col <= row
-                compressed_matrix<NumericT>       & U)   // receives all entries of A with col >= row
-{
-  // Splits A into L and U; diagonal entries are copied to both. L and U are known to have correct dimensions
-
-  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int       *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
-
-  //
-  // Step 1: Count elements in L and U (the per-row counts are added onto the entries of the row buffers)
-  // NOTE(review): assumes the row buffers of L and U hold zeros on entry - confirm with the caller
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      if (long(col) <= row)  // on or below the diagonal
-        ++L_row_buffer[row];
-      if (long(col) >= row)  // on or above the diagonal
-        ++U_row_buffer[row];
-    }
-  }
-
-  //
-  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
-  // (after each scan the last entry, index size1(), is the number of entries reserved for that matrix)
-  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
-  L.reserve(wrapped_L_row_buffer[L.size1()], false);
-
-  viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer);
-  U.reserve(wrapped_U_row_buffer[U.size1()], false);
-
-  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());  // fetched after reserve(); presumably reserve() (re)allocates these buffers
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  unsigned int       *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
-  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
-
-  //
-  // Step 3: Write entries (each row fills its own index ranges in L and U):
-  //
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    unsigned int index_L = L_row_buffer[row];  // next write position within L for this row
-    unsigned int index_U = U_row_buffer[row];  // next write position within U for this row
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      NumericT value = A_elements[j];
-
-      if (long(col) <= row)
-      {
-        L_col_buffer[index_L] = col;
-        L_elements[index_L]   = value;
-        ++index_L;
-      }
-
-      if (long(col) >= row)
-      {
-        U_col_buffer[index_U] = col;
-        U_elements[index_U]   = value;
-        ++index_U;
-      }
-    }
-  }
-
-} // extract_LU
-
-
-
-/** @brief Scales the values extracted from A such that A' = DAD has a diagonal of magnitude one, where D = diag(1 / sqrt(|a_ii|)). Updates the values in L and U accordingly; row block information is regenerated for L only. */
-template<typename NumericT>
-void ilu_scale(compressed_matrix<NumericT> const & A,
-               compressed_matrix<NumericT>       & L,
-               compressed_matrix<NumericT>       & U)
-{
-  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));  // one scaling factor per row of A
-
-  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
-
-  //
-  // Step 1: Determine D (rows without a stored diagonal entry leave D_elements[row] untouched)
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = A_row_buffer[row];
-    unsigned int col_end   = A_row_buffer[row+1];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-    {
-      unsigned int col = A_col_buffer[j];
-      if (row == col)
-      {
-        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));  // 1 / sqrt(|a_ii|)
-        break;
-      }
-    }
-  }
-
-  //
-  // Step 2: Scale values in L (entry (row, col) is multiplied by D[row] * D[col]):
-  //
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = L_row_buffer[row];
-    unsigned int col_end   = L_row_buffer[row+1];
-
-    NumericT D_row = D_elements[row];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
-  }
-
-  //
-  // Step 3: Scale values in U (same scaling rule as for L):
-  //
-  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
-  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
-  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int col_begin = U_row_buffer[row];
-    unsigned int col_end   = U_row_buffer[row+1];
-
-    NumericT D_row = D_elements[row];
-
-    for (unsigned int j = col_begin; j < col_end; ++j)
-      U_elements[j] *= D_row * D_elements[U_col_buffer[j]];
-  }
-
-  L.generate_row_block_information();
-  // Note: block information for U will be generated after transposition
-
-}
-
-template<typename NumericT>
-void ilu_transpose(compressed_matrix<NumericT> const & A,   // matrix to transpose, only read
-                   compressed_matrix<NumericT>       & B)   // overwritten with the transpose of A (serial implementation)
-{
-  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
-
-  // initialize datastructures for B (A.size2() x A.size1(), same number of nonzeros and same context as A):
-  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
-
-  NumericT     * B_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
-  unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
-  unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
-
-  // prepare uninitialized B_row_buffer (entries 0 .. B.size1()-1 are zeroed; the last entry is set after the scan below):
-  for (std::size_t i = 0; i < B.size1(); ++i)
-    B_row_buffer[i] = 0;
-
-  //
-  // Stage 1: Compute pattern for B (count the entries of A per column, i.e. per row of B)
-  //
-  for (std::size_t row = 0; row < A.size1(); ++row)
-  {
-    unsigned int row_start = A_row_buffer[row];
-    unsigned int row_stop  = A_row_buffer[row+1];
-
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-      B_row_buffer[A_col_buffer[nnz_index]] += 1;
-  }
-
-  // Bring row-start array in place using exclusive-scan (counts become start offsets; the total ends up in the last entry):
-  unsigned int offset = B_row_buffer[0];
-  B_row_buffer[0] = 0;
-  for (std::size_t row = 1; row < B.size1(); ++row)
-  {
-    unsigned int tmp = B_row_buffer[row];
-    B_row_buffer[row] = offset;
-    offset += tmp;
-  }
-  B_row_buffer[B.size1()] = offset;
-
-  //
-  // Stage 2: Fill with data (rows of A are visited in ascending order, so each row of B gets ascending column indices)
-  //
-
-  std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
-
-  for (unsigned int row = 0; row < static_cast<unsigned int>(A.size1()); ++row)
-  {
-    //std::cout << "Row " << row << ": ";
-    unsigned int row_start = A_row_buffer[row];
-    unsigned int row_stop  = A_row_buffer[row+1];
-
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col_in_A = A_col_buffer[nnz_index];
-      unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];  // next free slot in row col_in_A of B
-      B_col_buffer[B_nnz_index] = row;
-      B_elements[B_nnz_index] = A_elements[nnz_index];
-      ++B_row_offsets[col_in_A];
-      //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
-    }
-  }
-
-  // Step 3: Make datastructure consistent (row blocks!)
-  B.generate_row_block_information();
-}
-
-
-
-/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper)
-*
-* The values of L and U_trans are first copied into a temporary snapshot. Every stored entry is then
-* recomputed from that snapshot only, so the result does not depend on the order in which rows are processed.
-* Both loops are parallelized if VIENNACL_WITH_OPENMP is defined and the problem exceeds VIENNACL_OPENMP_ILU_MIN_SIZE.
-*
-* Column indices within each row are expected to be sorted ascendingly, and the diagonal entry of U is expected
-* to be the last stored entry of the respective row of U_trans (checked via assert for the rows that are used).
-*
-* @param L            Lower factor (row buffer / column buffer / values). Its off-diagonal values are overwritten.
-* @param aij_L        Reference values, one per stored entry of L (same indexing as the value array of L).
-* @param U_trans      Upper factor, stored transposed according to its name. All its values are overwritten.
-* @param aij_U_trans  Reference values, one per stored entry of U_trans (same indexing as the value array of U_trans).
-*/
-template<typename NumericT>
-void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
-                          vector<NumericT>            const & aij_L,
-                          compressed_matrix<NumericT>       & U_trans,
-                          vector<NumericT>            const & aij_U_trans)
-{
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  NumericT     const *aij_L_ptr    = detail::extract_raw_pointer<NumericT>(aij_L.handle());
-
-  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1());
-  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2());
-  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U_trans.handle());
-
-  NumericT     const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle());
-
-  // temporary workspace:
-  // a single allocation holds both snapshots, so a failing second allocation cannot leak the first buffer.
-  NumericT *backup   = new NumericT[L.nnz() + U_trans.nnz()];
-  NumericT *L_backup = backup;
-  NumericT *U_backup = backup + L.nnz();
-
-  // backup:
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
-    L_backup[i] = L_elements[i];
-
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i)
-    U_backup[i] = U_elements[i];
-
-  // sweep
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
-  {
-    //
-    // update L:
-    //
-    unsigned int row_L_start = L_row_buffer[row];
-    unsigned int row_L_end   = L_row_buffer[row + 1];
-
-    for (unsigned int j = row_L_start; j < row_L_end; ++j)
-    {
-      unsigned int col = L_col_buffer[j];
-
-      if (col == row)  // diagonal of L is not updated
-        continue;
-
-      unsigned int row_U_start = U_row_buffer[col];
-      unsigned int row_U_end   = U_row_buffer[col + 1];
-
-      // compute \sum_{k=1}^{j-1} l_ik u_kj
-      // (merge of two sorted index lists: entries of L left of position j in this row, and row 'col' of U_trans)
-      unsigned int index_U = row_U_start;
-      NumericT sum = 0;
-      for (unsigned int k = row_L_start; k < j; ++k)
-      {
-        unsigned int col_L = L_col_buffer[k];
-
-        // find element in U; never advance beyond the end of the current row of U_trans
-        while (index_U < row_U_end && U_col_buffer[index_U] < col_L)
-          ++index_U;
-
-        if (index_U < row_U_end && U_col_buffer[index_U] == col_L)
-          sum += L_backup[k] * U_backup[index_U];
-      }
-
-      // update l_ij:
-      assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!"));
-      L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1];  // diagonal element is last entry in U
-    }
-
-
-    //
-    // update U:
-    //
-    unsigned int row_U_start = U_row_buffer[row];
-    unsigned int row_U_end   = U_row_buffer[row + 1];
-    for (unsigned int j = row_U_start; j < row_U_end; ++j)
-    {
-      unsigned int col = U_col_buffer[j];
-
-      // row 'col' of L is the partner of the current entry of U_trans
-      unsigned int partner_L_start = L_row_buffer[col];
-      unsigned int partner_L_end   = L_row_buffer[col + 1];
-
-      // compute \sum_{k=1}^{j-1} l_ik u_kj
-      unsigned int index_L = partner_L_start;
-      NumericT sum = 0;
-      for (unsigned int k = row_U_start; k < j; ++k)
-      {
-        unsigned int col_U = U_col_buffer[k];
-
-        // find element in L; never advance beyond the end of the partner row of L
-        while (index_L < partner_L_end && L_col_buffer[index_L] < col_U)
-          ++index_L;
-
-        if (index_L < partner_L_end && L_col_buffer[index_L] == col_U)
-          sum += L_backup[index_L] * U_backup[k];
-      }
-
-      // update u_ij:
-      U_elements[j] = aij_U_trans_ptr[j] - sum;
-    }
-  }
-
-  delete[] backup;
-}
-
-
-/** @brief Splits off the diagonal of R and rescales the remaining entries in place.
-*
-* For each row, the first stored entry located on the diagonal is written to diag_R and replaced by zero in R
-* (if no such entry is stored, zero is written to diag_R). Afterwards all stored entries of the row are divided
-* by the negated diagonal value. A vanishing diagonal value triggers an assertion.
-* The row loop is parallelized if VIENNACL_WITH_OPENMP is defined and R.size1() exceeds VIENNACL_OPENMP_ILU_MIN_SIZE.
-*
-* @param R       Sparse matrix (row buffer / column buffer / values); its values are modified in place.
-* @param diag_R  Receives one diagonal value per row of R.
-*/
-template<typename NumericT>
-void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
-                             vector<NumericT> & diag_R)
-{
-  unsigned int const *row_offsets = detail::extract_raw_pointer<unsigned int>(R.handle1());
-  unsigned int const *col_indices = detail::extract_raw_pointer<unsigned int>(R.handle2());
-  NumericT           *values      = detail::extract_raw_pointer<NumericT>(R.handle());
-
-  NumericT           *diag_out    = detail::extract_raw_pointer<NumericT>(diag_R.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(R.size1()); ++row)
-  {
-    unsigned int const row_first = row_offsets[row];
-    unsigned int const row_last  = row_offsets[row+1];
-
-    // step 1: locate the first stored diagonal entry of this row (if any)
-    unsigned int pos = row_first;
-    while (pos < row_last && col_indices[pos] != row)
-      ++pos;
-
-    // step 2: pull the diagonal value out of R, leaving a zero behind: (I - D^{-1}R)
-    NumericT diag = 0;
-    if (pos < row_last)
-    {
-      diag = values[pos];
-      values[pos] = 0;
-    }
-    diag_out[row] = diag;
-
-    assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!"));
-
-    // step 3: scale all stored entries of the row
-    for (unsigned int k = row_first; k < row_last; ++k)
-      values[k] /= -diag;
-  }
-}
-
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif