You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/10 16:52:27 UTC
[22/51] [partial] mahout git commit: Revert "(nojira) add
native-viennaCL module to codebase. closes apache/mahout#241"
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
deleted file mode 100644
index 1d212c2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
+++ /dev/null
@@ -1,307 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
-#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
-
-/* =========================================================================
- Copyright (c) 2010-2016, Institute for Microelectronics,
- Institute for Analysis and Scientific Computing,
- TU Wien.
- Portions of this software are copyright by UChicago Argonne, LLC.
-
- -----------------
- ViennaCL - The Vienna Computing Library
- -----------------
-
- Project Head: Karl Rupp rupp@iue.tuwien.ac.at
-
- (A list of authors and contributors can be found in the manual)
-
- License: MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/direct_solve.hpp
- @brief Implementations of dense direct triangular solvers are found here.
-*/
-
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-
-#include "viennacl/linalg/host_based/common.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-namespace detail
-{
- //
- // Upper solve (multiple right-hand sides):
- //
- /** @brief Back substitution for an upper triangular system; B is overwritten with the solution.
-  *
-  * @param A              Accessor for the system matrix, read through A(row, col)
-  * @param B              Accessor for the right-hand sides, read and written through B(row, col)
-  * @param A_size         Number of rows/columns of A that are processed
-  * @param B_size         Number of columns of B that are processed
-  * @param unit_diagonal  If true, the diagonal of A is not read and no division is performed
-  */
- template<typename MatrixT1, typename MatrixT2>
- void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
- {
-   typedef typename MatrixT2::value_type value_type;
-
-   // Rows are visited bottom-up: A_size-1 first, 0 last.
-   for (vcl_size_t row = A_size; row-- > 0; )
-   {
-     // Eliminate the already solved unknowns to the right of the diagonal.
-     for (vcl_size_t col = row + 1; col < A_size; ++col)
-     {
-       value_type factor = A(row, col);
-       for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-         B(row, rhs) -= factor * B(col, rhs);
-     }
-
-     if (unit_diagonal)
-       continue;
-
-     value_type pivot = A(row, row);
-     for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-       B(row, rhs) /= pivot;
-   }
- }
-
- /** @brief Tag overload: upper triangular solve with an implicit unit diagonal. */
- template<typename MatrixT1, typename MatrixT2>
- void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
- {
-   bool const diagonal_is_unit = true;
-   upper_inplace_solve_matrix(A, B, A_size, B_size, diagonal_is_unit);
- }
-
- /** @brief Tag overload: upper triangular solve that divides by the stored diagonal of A. */
- template<typename MatrixT1, typename MatrixT2>
- void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
- {
-   bool const diagonal_is_unit = false;
-   upper_inplace_solve_matrix(A, B, A_size, B_size, diagonal_is_unit);
- }
-
- //
- // Lower solve (multiple right-hand sides):
- //
- /** @brief Forward substitution for a lower triangular system; B is overwritten with the solution.
-  *
-  * @param A              Accessor for the system matrix, read through A(row, col)
-  * @param B              Accessor for the right-hand sides, read and written through B(row, col)
-  * @param A_size         Number of rows/columns of A that are processed
-  * @param B_size         Number of columns of B that are processed
-  * @param unit_diagonal  If true, the diagonal of A is not read and no division is performed
-  */
- template<typename MatrixT1, typename MatrixT2>
- void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
- {
-   typedef typename MatrixT2::value_type value_type;
-
-   for (vcl_size_t row = 0; row < A_size; ++row)
-   {
-     // Eliminate the already solved unknowns to the left of the diagonal.
-     for (vcl_size_t col = 0; col < row; ++col)
-     {
-       value_type factor = A(row, col);
-       for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-         B(row, rhs) -= factor * B(col, rhs);
-     }
-
-     if (unit_diagonal)
-       continue;
-
-     value_type pivot = A(row, row);
-     for (vcl_size_t rhs = 0; rhs < B_size; ++rhs)
-       B(row, rhs) /= pivot;
-   }
- }
-
- /** @brief Tag overload: lower triangular solve with an implicit unit diagonal. */
- template<typename MatrixT1, typename MatrixT2>
- void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
- {
-   bool const diagonal_is_unit = true;
-   lower_inplace_solve_matrix(A, B, A_size, B_size, diagonal_is_unit);
- }
-
- /** @brief Tag overload: lower triangular solve that divides by the stored diagonal of A. */
- template<typename MatrixT1, typename MatrixT2>
- void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
- {
-   bool const diagonal_is_unit = false;
-   lower_inplace_solve_matrix(A, B, A_size, B_size, diagonal_is_unit);
- }
-
-}
-
-//
-// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
-//
-
-////////////////// triangular solver, multiple right hand sides (any solver tag) //////////////////////////////////////
-/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation)
-*
-* The storage layouts of A and B are queried at runtime via row_major() and the matching
-* array wrappers are handed to detail::inplace_solve_matrix(), which is selected by the tag type.
-*
-* @param A The system matrix
-* @param B The matrix of right hand sides, where the solution is directly written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_base<NumericT> const & A,
-                   matrix_base<NumericT> & B,
-                   SolverTagT)
-{
-  NumericT const * raw_A = detail::extract_raw_pointer<NumericT>(A);
-  NumericT       * raw_B = detail::extract_raw_pointer<NumericT>(B);
-
-  // Offsets, strides and padded extents describing A inside its buffer.
-  vcl_size_t A_start1         = viennacl::traits::start1(A);
-  vcl_size_t A_start2         = viennacl::traits::start2(A);
-  vcl_size_t A_inc1           = viennacl::traits::stride1(A);
-  vcl_size_t A_inc2           = viennacl::traits::stride2(A);
-  vcl_size_t A_size2          = viennacl::traits::size2(A);
-  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
-  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
-
-  // Same for B.
-  vcl_size_t B_start1         = viennacl::traits::start1(B);
-  vcl_size_t B_start2         = viennacl::traits::start2(B);
-  vcl_size_t B_inc1           = viennacl::traits::stride1(B);
-  vcl_size_t B_inc2           = viennacl::traits::stride2(B);
-  vcl_size_t B_size2          = viennacl::traits::size2(B);
-  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
-  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B);
-
-  bool const A_is_row_major = A.row_major();
-  bool const B_is_row_major = B.row_major();
-
-  if (A_is_row_major)
-  {
-    detail::matrix_array_wrapper<NumericT const, row_major, false> view_A(raw_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
-
-    if (B_is_row_major)
-    {
-      detail::matrix_array_wrapper<NumericT, row_major, false> view_B(raw_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
-      detail::inplace_solve_matrix(view_A, view_B, A_size2, B_size2, SolverTagT());
-    }
-    else
-    {
-      detail::matrix_array_wrapper<NumericT, column_major, false> view_B(raw_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
-      detail::inplace_solve_matrix(view_A, view_B, A_size2, B_size2, SolverTagT());
-    }
-  }
-  else
-  {
-    detail::matrix_array_wrapper<NumericT const, column_major, false> view_A(raw_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
-
-    if (B_is_row_major)
-    {
-      detail::matrix_array_wrapper<NumericT, row_major, false> view_B(raw_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
-      detail::inplace_solve_matrix(view_A, view_B, A_size2, B_size2, SolverTagT());
-    }
-    else
-    {
-      detail::matrix_array_wrapper<NumericT, column_major, false> view_B(raw_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
-      detail::inplace_solve_matrix(view_A, view_B, A_size2, B_size2, SolverTagT());
-    }
-  }
-}
-
-
-//
-// Solve on vector
-//
-
-namespace detail
-{
- //
- // Upper solve (single right-hand side):
- //
- /** @brief Back substitution for an upper triangular system; b is overwritten with the solution.
-  *
-  * @param A              Accessor for the system matrix, read through A(row, col)
-  * @param b              Accessor for the right-hand side, read and written through b(index)
-  * @param A_size         Number of rows/columns of A that are processed
-  * @param unit_diagonal  If true, the diagonal of A is not read and no division is performed
-  */
- template<typename MatrixT, typename VectorT>
- void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
- {
-   typedef typename VectorT::value_type value_type;
-
-   // Rows are visited bottom-up: A_size-1 first, 0 last.
-   for (vcl_size_t row = A_size; row-- > 0; )
-   {
-     for (vcl_size_t col = row + 1; col < A_size; ++col)
-     {
-       value_type factor = A(row, col);
-       b(row) -= factor * b(col);
-     }
-
-     if (unit_diagonal)
-       continue;
-
-     b(row) /= A(row, row);
-   }
- }
-
- /** @brief Tag overload: upper triangular solve with an implicit unit diagonal. */
- template<typename MatrixT, typename VectorT>
- void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
- {
-   bool const diagonal_is_unit = true;
-   upper_inplace_solve_vector(A, b, A_size, diagonal_is_unit);
- }
-
- /** @brief Tag overload: upper triangular solve that divides by the stored diagonal of A. */
- template<typename MatrixT, typename VectorT>
- void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
- {
-   bool const diagonal_is_unit = false;
-   upper_inplace_solve_vector(A, b, A_size, diagonal_is_unit);
- }
-
- //
- // Lower solve (single right-hand side):
- //
- /** @brief Forward substitution for a lower triangular system; b is overwritten with the solution.
-  *
-  * @param A              Accessor for the system matrix, read through A(row, col)
-  * @param b              Accessor for the right-hand side, read and written through b(index)
-  * @param A_size         Number of rows/columns of A that are processed
-  * @param unit_diagonal  If true, the diagonal of A is not read and no division is performed
-  */
- template<typename MatrixT, typename VectorT>
- void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
- {
-   typedef typename VectorT::value_type value_type;
-
-   for (vcl_size_t row = 0; row < A_size; ++row)
-   {
-     for (vcl_size_t col = 0; col < row; ++col)
-     {
-       value_type factor = A(row, col);
-       b(row) -= factor * b(col);
-     }
-
-     if (unit_diagonal)
-       continue;
-
-     b(row) /= A(row, row);
-   }
- }
-
- /** @brief Tag overload: lower triangular solve with an implicit unit diagonal. */
- template<typename MatrixT, typename VectorT>
- void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
- {
-   bool const diagonal_is_unit = true;
-   lower_inplace_solve_vector(A, b, A_size, diagonal_is_unit);
- }
-
- /** @brief Tag overload: lower triangular solve that divides by the stored diagonal of A. */
- template<typename MatrixT, typename VectorT>
- void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
- {
-   bool const diagonal_is_unit = false;
-   lower_inplace_solve_vector(A, b, A_size, diagonal_is_unit);
- }
-
-}
-
-/** @brief Direct inplace solver for a triangular system with a single right hand side
-*
-* The storage layout of mat is queried at runtime via row_major(); the solve itself is done by
-* detail::inplace_solve_vector(), which is selected by the tag type.
-*
-* @param mat The system matrix
-* @param vec The right hand side, where the solution is directly written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_base<NumericT> const & mat,
-                   vector_base<NumericT> & vec,
-                   SolverTagT)
-{
-  NumericT const * raw_mat = detail::extract_raw_pointer<NumericT>(mat);
-  NumericT       * raw_vec = detail::extract_raw_pointer<NumericT>(vec);
-
-  // Offsets, strides and padded extents describing mat inside its buffer.
-  vcl_size_t mat_start1         = viennacl::traits::start1(mat);
-  vcl_size_t mat_start2         = viennacl::traits::start2(mat);
-  vcl_size_t mat_inc1           = viennacl::traits::stride1(mat);
-  vcl_size_t mat_inc2           = viennacl::traits::stride2(mat);
-  vcl_size_t mat_size2          = viennacl::traits::size2(mat);
-  vcl_size_t mat_internal_size1 = viennacl::traits::internal_size1(mat);
-  vcl_size_t mat_internal_size2 = viennacl::traits::internal_size2(mat);
-
-  vcl_size_t vec_start = viennacl::traits::start(vec);
-  vcl_size_t vec_inc   = viennacl::traits::stride(vec);
-
-  // The view on the right hand side is the same for both matrix layouts.
-  detail::vector_array_wrapper<NumericT> view_v(raw_vec, vec_start, vec_inc);
-
-  if (mat.row_major())
-  {
-    detail::matrix_array_wrapper<NumericT const, row_major, false> view_A(raw_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
-    detail::inplace_solve_vector(view_A, view_v, mat_size2, SolverTagT());
-  }
-  else
-  {
-    detail::matrix_array_wrapper<NumericT const, column_major, false> view_A(raw_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
-    detail::inplace_solve_vector(view_A, view_v, mat_size2, SolverTagT());
-  }
-}
-
-
-} // namespace host_based
-} // namespace linalg
-} // namespace viennacl
-
-#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
deleted file mode 100644
index f53f8f2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
+++ /dev/null
@@ -1,856 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
-
-/* =========================================================================
- Copyright (c) 2010-2016, Institute for Microelectronics,
- Institute for Analysis and Scientific Computing,
- TU Wien.
- Portions of this software are copyright by UChicago Argonne, LLC.
-
- -----------------
- ViennaCL - The Vienna Computing Library
- -----------------
-
- Project Head: Karl Rupp rupp@iue.tuwien.ac.at
-
- (A list of authors and contributors can be found in the manual)
-
- License: MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/fft_operations.hpp
- @brief Implementations of Fast Fourier Transformation using a plain single-threaded or OpenMP-enabled execution on CPU
- */
-
-//TODO OpenMP conditions
-#include <viennacl/vector.hpp>
-#include <viennacl/matrix.hpp>
-
-#include "viennacl/linalg/host_based/vector_operations.hpp"
-
-#include <stdexcept>
-#include <cmath>
-#include <complex>
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-namespace detail
-{
- namespace fft
- {
- // Largest transform length for which the radix2() functions in this file take the
- // fft_radix2_local() path; longer transforms go through reorder() + fft_radix2().
- const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
-
- // Addressing mode for batched transforms. The kernels in this file test `!data_order`:
- // for ROW_MAJOR (value 0) element n of batch b is read at b * stride + n,
- // otherwise (COL_MAJOR) at n * stride + b.
- namespace FFT_DATA_ORDER
- {
- enum DATA_ORDER
- {
- ROW_MAJOR, COL_MAJOR
- };
- }
-
- /** @brief Returns the smallest exponent e such that 2^e >= size (0 for size 0 or 1). */
- inline vcl_size_t num_bits(vcl_size_t size)
- {
-   vcl_size_t exponent = 0;
-
-   // Double a running power of two until it reaches size, counting the doublings.
-   for (vcl_size_t pow2 = 1; pow2 < size; pow2 = pow2 << 1)
-     ++exponent;
-
-   return exponent;
- }
-
- /** @brief Rounds n up to the next power of two (n itself if it already is one).
-  *
-  * Works by smearing the highest set bit of n-1 into all lower positions and adding one.
-  * For n == 0 the decrement wraps around and the result is 0.
-  */
- inline vcl_size_t next_power_2(vcl_size_t n)
- {
-   --n;
-
-   // Shift distances 1, 2, 4, ... up to half the bit width of vcl_size_t.
-   for (vcl_size_t shift = 1; shift < sizeof(vcl_size_t) * 8; shift *= 2)
-     n |= (n >> shift);
-
-   return n + 1;
- }
-
- /** @brief Reverses the order of the lowest bit_size bits of v (index permutation used before an in-place radix-2 FFT).
-  *
-  * The value is mirrored as a 32-bit word and then shifted down so that bit_size bits remain.
-  *
-  * @param v         Index to be reversed; expected to be smaller than 2^bit_size
-  * @param bit_size  Number of significant bits; must not exceed 32
-  * @return          The bit-reversed index (0 if bit_size is 0)
-  */
- inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
- {
-   // A shift by (32 - 0) is undefined where vcl_size_t is only 32 bits wide.
-   if (bit_size == 0)
-     return 0;
-
-   v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);  // swap neighbouring bits
-   v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);  // swap bit pairs
-   v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);  // swap nibbles
-   v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);  // swap bytes
-
-   // Swap the 16-bit halves. Where vcl_size_t is wider than 32 bits, `v << 16` also moves the
-   // upper half to bits 32..47; without the mask those bits survive the final shift and
-   // corrupt the result for every index >= 2^16.
-   v = ((v >> 16) | (v << 16)) & 0xFFFFFFFFu;
-
-   return v >> (32 - bit_size);
- }
-
- /** @brief Packs an interleaved (re, im, re, im, ...) viennacl::vector into an array of std::complex.
-  *
-  * @param input_complex  Destination array with room for size complex values
-  * @param in             Source vector; entries 0 .. 2*size-1 are read
-  * @param size           Number of complex values to produce
-  */
- template<typename NumericT, unsigned int AlignmentV>
- void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                            viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long pos = 0; pos < long(size * 2); pos += 2)
-   {
-     // pos addresses the real part; the imaginary part is stored right behind it
-     vcl_size_t re_pos = vcl_size_t(pos);
-     vcl_size_t im_pos = re_pos + 1;
-     input_complex[re_pos / 2] = std::complex<NumericT>(in[re_pos], in[im_pos]);
-   }
- }
-
- /** @brief Packs an interleaved (re, im, re, im, ...) viennacl::vector_base into an array of std::complex.
-  *
-  * @param input_complex  Destination array with room for size complex values
-  * @param in             Source vector; entries 0 .. 2*size-1 are read
-  * @param size           Number of complex values to produce
-  */
- template<typename NumericT>
- void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                            viennacl::vector_base<NumericT> const & in, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long pos = 0; pos < long(size * 2); pos += 2)
-   {
-     // pos addresses the real part; the imaginary part is stored right behind it
-     vcl_size_t re_pos = vcl_size_t(pos);
-     vcl_size_t im_pos = re_pos + 1;
-     input_complex[re_pos / 2] = std::complex<NumericT>(in[re_pos], in[im_pos]);
-   }
- }
-
- /** @brief Unpacks an array of std::complex into an interleaved (re, im, re, im, ...) viennacl::vector.
-  *
-  * @param input_complex  Source array holding size complex values
-  * @param in             Destination vector; entries 0 .. 2*size-1 are written
-  * @param size           Number of complex values to unpack
-  */
- template<typename NumericT, unsigned int AlignmentV>
- void copy_to_vector(std::complex<NumericT> * input_complex,
-                     viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long idx = 0; idx < long(size); idx++)
-   {
-     vcl_size_t src    = vcl_size_t(idx);
-     vcl_size_t re_pos = src * 2;
-     in(re_pos)     = static_cast<NumericT>(std::real(input_complex[src]));
-     in(re_pos + 1) = static_cast<NumericT>(std::imag(input_complex[src]));
-   }
- }
-
- /** @brief Packs an interleaved (re, im, re, im, ...) plain array into an array of std::complex.
-  *
-  * @param input_complex  Destination array with room for size complex values
-  * @param in             Source array; entries 0 .. 2*size-1 are read
-  * @param size           Number of complex values to produce
-  */
- template<typename NumericT>
- void copy_to_complex_array(std::complex<NumericT> * input_complex,
-                            NumericT const * in, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long pos = 0; pos < long(size * 2); pos += 2)
-   {
-     // pos addresses the real part; the imaginary part is stored right behind it
-     vcl_size_t re_pos = vcl_size_t(pos);
-     vcl_size_t im_pos = re_pos + 1;
-     input_complex[re_pos / 2] = std::complex<NumericT>(in[re_pos], in[im_pos]);
-   }
- }
-
- /** @brief Unpacks an array of std::complex into an interleaved (re, im, re, im, ...) plain array.
-  *
-  * @param input_complex  Source array holding size complex values
-  * @param in             Destination array; entries 0 .. 2*size-1 are written
-  * @param size           Number of complex values to unpack
-  */
- template<typename NumericT>
- void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long idx = 0; idx < long(size); idx++)
-   {
-     vcl_size_t src    = vcl_size_t(idx);
-     vcl_size_t re_pos = src * 2;
-     in[re_pos]     = static_cast<NumericT>(std::real(input_complex[src]));
-     in[re_pos + 1] = static_cast<NumericT>(std::imag(input_complex[src]));
-   }
- }
-
- /** @brief Unpacks an array of std::complex into an interleaved (re, im, re, im, ...) viennacl::vector_base.
-  *
-  * The values are first gathered in a temporary std::vector and then transferred with a single viennacl::copy().
-  *
-  * @param input_complex  Source array holding size complex values
-  * @param in             Destination vector, filled from a buffer of 2*size values
-  * @param size           Number of complex values to unpack
-  */
- template<typename NumericT>
- void copy_to_vector(std::complex<NumericT> * input_complex,
-                     viennacl::vector_base<NumericT> & in, vcl_size_t size)
- {
-   std::vector<NumericT> temp(2 * size);
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long idx = 0; idx < long(size); idx++)
-   {
-     vcl_size_t src    = vcl_size_t(idx);
-     vcl_size_t re_pos = src * 2;
-     temp[re_pos]     = static_cast<NumericT>(std::real(input_complex[src]));
-     temp[re_pos + 1] = static_cast<NumericT>(std::imag(input_complex[src]));
-   }
-   viennacl::copy(temp, in);
- }
-
- /** @brief Sets the first size entries of both arrays to zero.
-  *
-  * @param input1  First array to clear
-  * @param input2  Second array to clear
-  * @param size    Number of entries cleared in each array
-  */
- template<typename NumericT>
- void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
- {
-#ifdef VIENNACL_WITH_OPENMP
-   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-   for (long idx = 0; idx < long(size); idx++)
-   {
-     vcl_size_t pos = vcl_size_t(idx);
-     input1[pos] = 0;
-     input2[pos] = 0;
-   }
- }
-
- } //namespace fft
-
-} //namespace detail
-
-/**
- * @brief Direct algorithm kernel: evaluates the discrete Fourier sum term by term.
- *
- * For every batch and every output index k the sum over n of
- * input[n] * exp(i * sign * 2 * pi * k * n / size) is accumulated and stored.
- *
- * @param input_complex  Input samples of all batches
- * @param output         Destination for the transformed values (same addressing as the input)
- * @param size           Transform length per batch
- * @param stride         Distance used to address batches (ROW_MAJOR) or samples (otherwise)
- * @param batch_num      Number of independent transforms
- * @param sign           Sign of the exponent
- * @param data_order     ROW_MAJOR: element n of batch b is at b * stride + n; otherwise at n * stride + b
- */
-template<typename NumericT>
-void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
-                vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
-                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-  // `parallel for` distributes the batches over the threads; a bare `parallel` region would
-  // let every thread run the complete loop and write every output element.
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
-  {
-    vcl_size_t batch_id = vcl_size_t(batch_id2);
-    for (vcl_size_t k = 0; k < size; k += 1)
-    {
-      std::complex<NumericT> f = 0;
-      for (vcl_size_t n = 0; n < size; n++)
-      {
-        std::complex<NumericT> input;
-        if (!data_order)
-          input = input_complex[batch_id * stride + n]; //input index here
-        else
-          input = input_complex[n * stride + batch_id];
-
-        // Phase of the twiddle factor: sign * 2*pi * k * n / size.
-        // n belongs in the numerator; dividing by (size * n) is a division by zero for n == 0.
-        NumericT arg = sign * 2 * NUM_PI * NumericT(k) / NumericT(size) * NumericT(n);
-        NumericT sn = std::sin(arg);
-        NumericT cs = std::cos(arg);
-
-        // f += input * (cs + i * sn), written out component-wise
-        std::complex<NumericT> ex(cs, sn);
-        std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
-                                   input.real() * ex.imag() + input.imag() * ex.real());
-        f = f + tmp;
-      }
-      if (!data_order)
-        output[batch_id * stride + k] = f; // output index here
-      else
-        output[k * stride + batch_id] = f;
-    }
-  }
-
-}
-
-/**
- * @brief Direct 1D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has O(n^2) complexity.
- *
- * The interleaved input is unpacked into complex host arrays, transformed by fft_direct()
- * and the result is written to out in interleaved form again.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::vector<NumericT, AlignmentV> const & in,
-            viennacl::vector<NumericT, AlignmentV> & out,
-            vcl_size_t size, vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_detail = viennacl::linalg::host_based::detail::fft;
-
-  vcl_size_t const total_points = size * batch_num;
-
-  std::vector<std::complex<NumericT> > samples(total_points);
-  std::vector<std::complex<NumericT> > spectrum(total_points);
-
-  fft_detail::copy_to_complex_array(&samples[0], in, total_points);
-
-  fft_direct(&samples[0], &spectrum[0], size, stride, batch_num, sign, data_order);
-
-  fft_detail::copy_to_vector(&spectrum[0], out, total_points);
-}
-
-/**
- * @brief Direct 2D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has O(n^2) complexity.
- *
- * The whole padded buffer of the matrix (internal_size1 x internal_size2 / 2 complex values)
- * is unpacked, transformed by fft_direct() and written to the buffer of out.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
-            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out, vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_detail = viennacl::linalg::host_based::detail::fft;
-
-  // Two consecutive real entries form one complex value, hence half the padded column count.
-  vcl_size_t const complex_rows  = in.internal_size1();
-  vcl_size_t const complex_cols  = in.internal_size2() >> 1;
-  vcl_size_t const complex_count = complex_rows * complex_cols;
-
-  std::vector<std::complex<NumericT> > samples(complex_count);
-  std::vector<std::complex<NumericT> > spectrum(complex_count);
-
-  NumericT const * raw_in  = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * raw_out = detail::extract_raw_pointer<NumericT>(out);
-
-  fft_detail::copy_to_complex_array(&samples[0], raw_in, complex_count);
-
-  fft_direct(&samples[0], &spectrum[0], size, stride, batch_num, sign, data_order);
-
-  fft_detail::copy_to_vector(&spectrum[0], raw_out, complex_count);
-}
-
-/*
- * Reorders 1D input data in place: within every batch, element i is exchanged with the element
- * whose index is the bit-reversal of i (bits_datasize significant bits).
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
-             vcl_size_t bits_datasize, vcl_size_t batch_num,
-             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_detail = viennacl::linalg::host_based::detail::fft;
-
-  std::vector<std::complex<NumericT> > input(size * batch_num);
-  fft_detail::copy_to_complex_array(&input[0], in, size * batch_num);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
-  {
-    vcl_size_t batch_id = vcl_size_t(batch_id2);
-    for (vcl_size_t i = 0; i < size; i++)
-    {
-      vcl_size_t v = fft_detail::get_reorder_num(i, bits_datasize);
-
-      // Each pair is exchanged once, when visited from its smaller index.
-      if (i >= v)
-        continue;
-
-      vcl_size_t first;
-      vcl_size_t second;
-      if (!data_order)
-      {
-        first  = batch_id * stride + i;
-        second = batch_id * stride + v;
-      }
-      else
-      {
-        first  = i * stride + batch_id;
-        second = v * stride + batch_id;
-      }
-
-      std::complex<NumericT> tmp = input[first];
-      input[first]  = input[second];
-      input[second] = tmp;
-    }
-  }
-  fft_detail::copy_to_vector(&input[0], in, size * batch_num);
-}
-
-/*
- * Reorders 2D input data in place: within every batch, element i is exchanged with the element
- * whose index is the bit-reversal of i (bits_datasize significant bits).
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
-             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
-             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  namespace fft_detail = viennacl::linalg::host_based::detail::fft;
-
-  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
-
-  // Two consecutive real entries form one complex value, hence half the padded column count.
-  vcl_size_t const complex_rows  = in.internal_size1();
-  vcl_size_t const complex_cols  = in.internal_size2() >> 1;
-  vcl_size_t const complex_count = complex_rows * complex_cols;
-
-  std::vector<std::complex<NumericT> > input(complex_count);
-
-  fft_detail::copy_to_complex_array(&input[0], data, complex_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
-  {
-    vcl_size_t batch_id = vcl_size_t(batch_id2);
-    for (vcl_size_t i = 0; i < size; i++)
-    {
-      vcl_size_t v = fft_detail::get_reorder_num(i, bits_datasize);
-
-      // Each pair is exchanged once, when visited from its smaller index.
-      if (i >= v)
-        continue;
-
-      vcl_size_t first;
-      vcl_size_t second;
-      if (!data_order)
-      {
-        first  = batch_id * stride + i;
-        second = batch_id * stride + v;
-      }
-      else
-      {
-        first  = i * stride + batch_id;
-        second = v * stride + batch_id;
-      }
-
-      std::complex<NumericT> tmp = input[first];
-      input[first]  = input[second];
-      input[second] = tmp;
-    }
-  }
-  fft_detail::copy_to_vector(&input[0], data, complex_count);
-}
-
-/**
- * @brief Radix-2 algorithm for computing Fourier transformation.
- *        Kernel working in place on the full array; the data is expected to be in
- *        bit-reversed order already (see reorder()).
- *
- * @param input_complex  Data of all batches, transformed in place
- * @param batch_num      Number of independent transforms
- * @param bit_size       Number of butterfly stages
- * @param size           Transform length per batch
- * @param stride         Distance used to address batches (ROW_MAJOR) or samples (otherwise)
- * @param sign           Sign of the exponent
- * @param data_order     ROW_MAJOR: element p of batch b is at b * stride + p; otherwise at p * stride + b
- */
-template<typename NumericT>
-void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num,
-                vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign,
-                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-
-  // One butterfly per pair of points; does not change between stages.
-  vcl_size_t const half_size = size >> 1;
-
-  for (vcl_size_t step = 0; step < bit_size; step++)
-  {
-    // Distance between the two inputs of a butterfly in this stage. The shift is done in
-    // vcl_size_t: `1 << step` would be an int shift, which is undefined once step reaches
-    // the width of int.
-    vcl_size_t const ss = vcl_size_t(1) << step;
-
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
-    {
-      vcl_size_t batch_id = vcl_size_t(batch_id2);
-      for (vcl_size_t tid = 0; tid < half_size; tid++)
-      {
-        vcl_size_t group = (tid & (ss - 1));
-        vcl_size_t pos = ((tid >> step) << (step + 1)) + group;
-
-        // Memory locations of the two butterfly inputs.
-        vcl_size_t offset;
-        vcl_size_t partner;
-        if (!data_order)
-        {
-          offset  = batch_id * stride + pos;
-          partner = offset + ss;
-        }
-        else
-        {
-          offset  = pos * stride + batch_id;
-          partner = offset + ss * stride;
-        }
-        std::complex<NumericT> in1 = input_complex[offset];
-        std::complex<NumericT> in2 = input_complex[partner];
-
-        // Twiddle factor exp(i * sign * pi * group / ss), multiplied onto in2 component-wise.
-        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
-        NumericT sn = std::sin(arg);
-        NumericT cs = std::cos(arg);
-        std::complex<NumericT> ex(cs, sn);
-        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
-                                   in2.real() * ex.imag() + in2.imag() * ex.real());
-
-        input_complex[partner] = in1 - tmp;
-        input_complex[offset]  = in1 + tmp;
-      }
-    }
-  }
-
-}
-
-/**
- * @brief Radix-2 algorithm for computing Fourier transformation.
- *        Kernel that processes one batch at a time in a scratch buffer: the batch is copied to
- *        lcl_input in bit-reversed order, transformed there and copied back.
- *
- * @param input_complex  Data of all batches, overwritten with the result
- * @param lcl_input      Scratch buffer, indexed by bit-reversed positions and by 0 .. size-1
- * @param batch_num      Number of independent transforms
- * @param bit_size       Number of butterfly stages
- * @param size           Transform length per batch
- * @param stride         Distance used to address batches (ROW_MAJOR) or samples (otherwise)
- * @param sign           Sign of the exponent
- * @param data_order     ROW_MAJOR: element p of batch b is at b * stride + p; otherwise at p * stride + b
- */
-template<typename NumericT>
-void fft_radix2_local(std::complex<NumericT> * input_complex,
-                      std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size,
-                      vcl_size_t size, vcl_size_t stride, NumericT sign,
-                      viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  NumericT const NUM_PI = NumericT(3.14159265358979323846);
-
-  for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
-  {
-    // Gather the batch into the scratch buffer in bit-reversed order.
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long p2 = 0; p2 < long(size); p2 += 1)
-    {
-      vcl_size_t p = vcl_size_t(p2);
-      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size);
-
-      if (!data_order)
-        lcl_input[v] = input_complex[batch_id * stride + p]; //index
-      else
-        lcl_input[v] = input_complex[p * stride + batch_id];
-    }
-
-    for (vcl_size_t s = 0; s < bit_size; s++)
-    {
-      // Distance between the two inputs of a butterfly in this stage. The shift is done in
-      // vcl_size_t: `1 << s` would be an int shift, which is undefined once s reaches the
-      // width of int.
-      vcl_size_t const ss = vcl_size_t(1) << s;
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for
-#endif
-      for (long tid2 = 0; tid2 < long(size)/2; tid2++)
-      {
-        vcl_size_t tid = vcl_size_t(tid2);
-        vcl_size_t group = (tid & (ss - 1));
-        vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
-
-        std::complex<NumericT> in1 = lcl_input[pos];
-        std::complex<NumericT> in2 = lcl_input[pos + ss];
-
-        // Twiddle factor exp(i * sign * pi * group / ss), multiplied onto in2 component-wise.
-        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
-
-        NumericT sn = std::sin(arg);
-        NumericT cs = std::cos(arg);
-        std::complex<NumericT> ex(cs, sn);
-
-        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
-                                   in2.real() * ex.imag() + in2.imag() * ex.real());
-
-        lcl_input[pos + ss] = in1 - tmp;
-        lcl_input[pos] = in1 + tmp;
-      }
-
-    }
-
-    // Copy the scratch buffer back to the batch's place in the full array.
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long p2 = 0; p2 < long(size); p2 += 1)
-    {
-      vcl_size_t p = vcl_size_t(p2);
-      if (!data_order)
-        input_complex[batch_id * stride + p] = lcl_input[p];
-      else
-        input_complex[p * stride + batch_id] = lcl_input[p];
-
-    }
-
-  }
-
-}
-
-/**
- * @brief Radix-2 1D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has O(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm.
- *
- * Transforms of at most MAX_LOCAL_POINTS_NUM points use fft_radix2_local(); longer ones are
- * reordered first and then processed by fft_radix2(). The result replaces the content of in.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
-
-  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
-
-  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
-  {
-    // The scratch buffer is only needed on this path, so it is allocated here.
-    std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
-    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
-    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-  else
-  {
-    // reorder() permutes `in` itself, so the host copy is taken once, after the reordering.
-    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
-    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
-    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num);
-}
-
-/**
- * @brief Radix-2 2D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has O(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm.
- *
- * Operates on the whole padded buffer of the matrix (internal_size1 x internal_size2 / 2 complex
- * values). Transforms of at most MAX_LOCAL_POINTS_NUM points use fft_radix2_local(); longer ones
- * are reordered first and then processed by fft_radix2(). The result replaces the content of in.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
-
-  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
-
-  // Two consecutive real entries form one complex value, hence half the padded column count.
-  vcl_size_t row_num = in.internal_size1();
-  vcl_size_t col_num = in.internal_size2() >> 1;
-  vcl_size_t size_mat = row_num * col_num;
-
-  std::vector<std::complex<NumericT> > input_complex(size_mat);
-
-  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
-  {
-    std::vector<std::complex<NumericT> > lcl_input(size_mat);
-    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
-    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-  else
-  {
-    // reorder() permutes the matrix buffer itself, so the complex copy is taken once, after the reordering.
-    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
-    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
-    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
-  }
-
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat);
-
-}
-
-/**
- * @brief Bluestein's algorithm for computing Fourier transformation.
- *
- * Currently works only for input sizes smaller than 2^16.
- * Uses a lot of additional memory, but should be fast for any size of data.
- * The serial implementation has roughly O(n * lg n) complexity.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
-{
-
- vcl_size_t size = in.size() >> 1;
- vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1);
-
- viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
- viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
- viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
-
- std::vector<std::complex<NumericT> > input_complex(size);
- std::vector<std::complex<NumericT> > output_complex(size);
-
- std::vector<std::complex<NumericT> > A_complex(ext_size);
- std::vector<std::complex<NumericT> > B_complex(ext_size);
- std::vector<std::complex<NumericT> > Z_complex(ext_size);
-
- viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size);
-#ifdef VIENNACL_WITH_OPENMP
- #pragma omp parallel for
-#endif
- for (long i2 = 0; i2 < long(ext_size); i2++)
- {
- vcl_size_t i = vcl_size_t(i2);
- A_complex[i] = 0;
- B_complex[i] = 0;
- }
-
- vcl_size_t double_size = size << 1;
-
- NumericT const NUM_PI = NumericT(3.14159265358979323846);
-#ifdef VIENNACL_WITH_OPENMP
- #pragma omp parallel for
-#endif
- for (long i2 = 0; i2 < long(size); i2++)
- {
- vcl_size_t i = vcl_size_t(i2);
- vcl_size_t rm = i * i % (double_size);
- NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI);
-
- NumericT sn_a = std::sin(-angle);
- NumericT cs_a = std::cos(-angle);
-
- std::complex<NumericT> a_i(cs_a, sn_a);
- std::complex<NumericT> b_i(cs_a, -sn_a);
-
- A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
- input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
- B_complex[i] = b_i;
-
- // very bad instruction, to be fixed
- if (i)
- B_complex[ext_size - i] = b_i;
- }
-
- viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size);
- viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size);
- viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size);
-
- viennacl::linalg::convolve_i(A, B, Z);
-
- viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size);
-
-#ifdef VIENNACL_WITH_OPENMP
- #pragma omp parallel for
-#endif
- for (long i2 = 0; i2 < long(size); i2++)
- {
- vcl_size_t i = vcl_size_t(i2);
- vcl_size_t rm = i * i % (double_size);
- NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI);
- NumericT sn_a = std::sin(angle);
- NumericT cs_a = std::cos(angle);
- std::complex<NumericT> b_i(cs_a, sn_a);
- output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
- Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
- }
- viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size);
-
-}
-
-/**
- * @brief Divides the entries of a vector by half of its size
- *
- * The divisor is input.size()/2 (integer division); the first 2*(input.size()/2) entries are scaled.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void normalize(viennacl::vector<NumericT, AlignmentV> & input)
-{
-  vcl_size_t const half_count   = input.size() >> 1;
-  vcl_size_t const scalar_count = half_count * 2;
-  NumericT const   divisor      = static_cast<NumericT>(half_count);
-
-  for (vcl_size_t k = 0; k != scalar_count; ++k)
-    input[k] /= divisor;
-}
-
-/**
- * @brief Element-wise complex multiplication of two vectors
- *
- * Both inputs are read as input1.size()/2 complex values via copy_to_complex_array();
- * the products are written to the output vector via copy_to_vector().
- */
-template<typename NumericT, unsigned int AlignmentV>
-void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
-                      viennacl::vector<NumericT, AlignmentV> const & input2,
-                      viennacl::vector<NumericT, AlignmentV> & output)
-{
-  typedef std::complex<NumericT>  ComplexT;
-
-  vcl_size_t count = input1.size() >> 1;
-
-  std::vector<ComplexT> lhs(count);
-  std::vector<ComplexT> rhs(count);
-  std::vector<ComplexT> product(count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&lhs[0], input1, count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&rhs[0], input2, count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long k = 0; k < long(count); ++k)
-  {
-    vcl_size_t const idx = vcl_size_t(k);
-    ComplexT const & a = lhs[idx];
-    ComplexT const & b = rhs[idx];
-
-    // (a.re + i a.im) * (b.re + i b.im), written out component-wise
-    NumericT const re = a.real() * b.real() - a.imag() * b.imag();
-    NumericT const im = a.real() * b.imag() + a.imag() * b.real();
-    product[idx] = ComplexT(re, im);
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&product[0], output, count);
-}
-/**
- * @brief Inplace transpose of matrix
- *
- * The matrix buffer is copied into a temporary std::complex array, entries are exchanged
- * pairwise between position i = row*col_num + col and position col*row_num + row, and the
- * array is copied back.
- *
- * NOTE(review): the pairwise exchange maps every entry to its transposed position only if
- * row_num == col_num; non-square inputs look unsupported - confirm against callers.
- *
- * @param input  Matrix to transpose; overwritten with the result
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
-{
- vcl_size_t row_num = input.internal_size1() / 2;
- vcl_size_t col_num = input.internal_size2() / 2;
-
- vcl_size_t size = row_num * col_num;
-
- NumericT * data = detail::extract_raw_pointer<NumericT>(input);
-
- std::vector<std::complex<NumericT> > input_complex(size);
-
- viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size);
-#ifdef VIENNACL_WITH_OPENMP
- #pragma omp parallel for
-#endif
- for (long i2 = 0; i2 < long(size); i2++)
- {
- vcl_size_t i = vcl_size_t(i2);
- vcl_size_t row = i / col_num;
- vcl_size_t col = i - row * col_num; // same as i % col_num
- vcl_size_t new_pos = col * row_num + row; // linear index of the mirrored position
-
- if (i < new_pos) // only the lower index of a pair performs the exchange, so each pair is swapped once
- {
- std::complex<NumericT> val = input_complex[i];
- input_complex[i] = input_complex[new_pos];
- input_complex[new_pos] = val;
- }
- }
- viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size);
-
-}
-
-/**
- * @brief Transpose matrix
- *
- * Reads the input buffer as row_num x col_num complex values (half of each internal dimension)
- * and writes entry (row, col) to linear position col*row_num + row of the output buffer.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
-               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
-{
-  typedef std::complex<NumericT>  ComplexT;
-
-  vcl_size_t rows  = input.internal_size1() / 2;
-  vcl_size_t cols  = input.internal_size2() / 2;
-  vcl_size_t count = rows * cols;
-
-  NumericT const * src_raw = detail::extract_raw_pointer<NumericT>(input);
-  NumericT       * dst_raw = detail::extract_raw_pointer<NumericT>(output);
-
-  std::vector<ComplexT> src(count);
-  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&src[0], src_raw, count);
-
-  std::vector<ComplexT> dst(count);
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long k = 0; k < long(count); ++k)
-  {
-    vcl_size_t const linear = vcl_size_t(k);
-    vcl_size_t const r = linear / cols;
-    vcl_size_t const c = linear % cols;
-    dst[c * rows + r] = src[linear];
-  }
-  viennacl::linalg::host_based::detail::fft::copy_to_vector(&dst[0], dst_raw, count);
-}
-
-/**
- * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
- *
- * Entry k of the input becomes the real part at 2*k; the imaginary part at 2*k+1 is set to zero.
- */
-template<typename NumericT>
-void real_to_complex(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
-{
-  NumericT const * src = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * dst = detail::extract_raw_pointer<NumericT>(out);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long k = 0; k < long(size); ++k)
-  {
-    vcl_size_t const idx = static_cast<vcl_size_t>(k);
-    NumericT * slot = dst + 2*idx;
-    slot[0] = src[idx];
-    slot[1] = NumericT(0);
-  }
-}
-
-/**
- * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
- *
- * Only the real parts (entries 2*k of the input) are copied; the imaginary parts are not read.
- */
-template<typename NumericT>
-void complex_to_real(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
-{
-  NumericT const * src = detail::extract_raw_pointer<NumericT>(in);
-  NumericT       * dst = detail::extract_raw_pointer<NumericT>(out);
-
-#ifdef VIENNACL_WITH_OPENMP
-#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long k = 0; k < long(size); ++k)
-  {
-    dst[k] = src[2*k];
-  }
-}
-
-/**
- * @brief Reverses the order of the entries of a vector in place
- *
- * Only the first half of the index range is traversed: each iteration exchanges entry i with its
- * mirror entry size-1-i. Traversing the full range would exchange every pair twice, leaving the
- * vector unchanged, and would let two OpenMP iterations write the same pair of entries.
- *
- * @param in  Vector to reverse; overwritten with the result
- */
-template<typename NumericT>
-void reverse(viennacl::vector_base<NumericT> & in)
-{
-  vcl_size_t size = in.size();
-  vcl_size_t half_size = size / 2;   // the middle entry of an odd-sized vector stays where it is
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i2 = 0; i2 < long(half_size); i2++)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    vcl_size_t mirror = size - i - 1;
-    NumericT val1 = in[i];
-    NumericT val2 = in[mirror];
-    in[i] = val2;
-    in[mirror] = val1;
-  }
-}
-
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
deleted file mode 100644
index 62c885a..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
+++ /dev/null
@@ -1,672 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
-
-/* =========================================================================
- Copyright (c) 2010-2016, Institute for Microelectronics,
- Institute for Analysis and Scientific Computing,
- TU Wien.
- Portions of this software are copyright by UChicago Argonne, LLC.
-
- -----------------
- ViennaCL - The Vienna Computing Library
- -----------------
-
- Project Head: Karl Rupp rupp@iue.tuwien.ac.at
-
- (A list of authors and contributors can be found in the PDF manual)
-
- License: MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/ilu_operations.hpp
- @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP)
-*/
-
-#include <algorithm> //for std::max and std::min
-#include <cmath>
-#include <vector>
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/linalg/host_based/common.hpp"
-#include "viennacl/linalg/vector_operations.hpp"
-#include "viennacl/traits/stride.hpp"
-
-
-// Minimum problem size (number of matrix rows or nonzeros) above which the ILU routines use OpenMP:
-#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE
- #define VIENNACL_OPENMP_ILU_MIN_SIZE 5000
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-/** @brief Copies the entries of A with column index <= row index (diagonal included) into L.
- *
- * The row buffer of L is incremented once per kept entry and then converted into row start
- * offsets by an exclusive scan, so it is expected to hold zeros on entry (not checked here).
- */
-template<typename NumericT>
-void extract_L(compressed_matrix<NumericT> const & A,
-               compressed_matrix<NumericT> & L)
-{
-  // L is known to have correct dimensions
-
-  unsigned int const * src_rows   = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * src_cols   = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const * src_values = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  unsigned int * dst_rows = detail::extract_raw_pointer<unsigned int>(L.handle1());
-
-  //
-  // Pass 1: per-row count of entries on or below the diagonal
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    unsigned int kept = 0;
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      if (long(src_cols[k]) <= r)
-        ++kept;
-    }
-    dst_rows[r] += kept;
-  }
-
-  //
-  // Pass 2: exclusive scan turns the counts into row start offsets; the last offset is the entry count to reserve
-  //
-  viennacl::vector_base<unsigned int> dst_rows_view(L.handle1(), L.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(dst_rows_view);
-  L.reserve(dst_rows_view[L.size1()], false);
-
-  // Column and value buffers are fetched only after reserve()
-  unsigned int * dst_cols   = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT     * dst_values = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  //
-  // Pass 3: copy the kept entries row by row
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    unsigned int write_pos = dst_rows[r];
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int const c = src_cols[k];
-      NumericT const     v = src_values[k];
-
-      if (long(c) > r)
-        continue;
-
-      dst_cols[write_pos]   = c;
-      dst_values[write_pos] = v;
-      ++write_pos;
-    }
-  }
-
-} // extract_L
-
-
-/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly.
- *
- * D_i is set to 1/sqrt(|a_ii|) for each row of A that stores a diagonal entry; every stored
- * entry (i,j) of L is then multiplied by D_i * D_j.
- */
-template<typename NumericT>
-void icc_scale(compressed_matrix<NumericT> const & A,
-               compressed_matrix<NumericT> & L)
-{
-  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
-
-  unsigned int const * src_rows   = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * src_cols   = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const * src_values = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  NumericT * scale = detail::extract_raw_pointer<NumericT>(D.handle());
-
-  //
-  // Pass 1: scaling factor per row from the first stored diagonal entry
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int c = src_cols[k];
-      if (r != c)
-        continue;
-
-      scale[r] = NumericT(1) / std::sqrt(std::fabs(src_values[k]));
-      break;
-    }
-  }
-
-  //
-  // Pass 2: apply the row and column factors to every stored entry of L
-  //
-  unsigned int const * dst_rows   = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const * dst_cols   = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           * dst_values = detail::extract_raw_pointer<NumericT>(L.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = dst_rows[r];
-    unsigned int const stop  = dst_rows[r+1];
-
-    NumericT const row_factor = scale[r];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      dst_values[k] *= row_factor * scale[dst_cols[k]];
-    }
-  }
-
-  L.generate_row_block_information();
-}
-
-
-
-/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U)
- *
- * All entries of L are updated from a snapshot of the values taken at the start of the sweep,
- * so the result does not depend on the order in which rows are processed.
- *
- * @param L      Lower-triangular factor; its values are overwritten. The code relies on the diagonal entry being the last entry of each row.
- * @param aij_L  Entries of the system matrix, indexed like the nonzeros of L
- */
-template<typename NumericT>
-void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
-                          vector<NumericT> & aij_L)
-{
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  NumericT *aij_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle());
-
-  // Temporary workspace holding the snapshot of L. std::vector reports allocation failure by
-  // exception (instead of handing back an unchecked null pointer) and releases the memory on every exit path.
-  std::vector<NumericT> L_backup(L.nnz());
-
-  // backup:
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
-    L_backup[static_cast<vcl_size_t>(i)] = L_elements[i];
-
-
-  // sweep
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
-  {
-    //
-    // update L:
-    //
-    unsigned int row_Li_start = L_row_buffer[row];
-    unsigned int row_Li_end   = L_row_buffer[row + 1];
-
-    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
-    {
-      unsigned int col = L_col_buffer[i];
-
-      unsigned int row_Lj_start = L_row_buffer[col];
-      unsigned int row_Lj_end   = L_row_buffer[col+1];
-
-      // compute \sum_{k=1}^{j-1} l_ik l_jk
-      unsigned int index_Lj = row_Lj_start;
-      unsigned int col_Lj = L_col_buffer[index_Lj];
-      NumericT s = aij_ptr[i];
-      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
-      {
-        unsigned int col_Li = L_col_buffer[index_Li];
-
-        // find element in row j
-        while (col_Lj < col_Li)
-        {
-          ++index_Lj;
-          col_Lj = L_col_buffer[index_Lj];
-        }
-
-        if (col_Lj == col_Li)
-          s -= L_backup[index_Li] * L_backup[index_Lj];
-      }
-
-      if (row != col)
-        L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row!
-      else
-        L_elements[i] = std::sqrt(s);
-    }
-  }
-}
-
-
-
-//////////////////////// ILU ////////////////////////
-
-/** @brief Splits A into L (entries with column <= row) and U (entries with column >= row); diagonal entries go to both.
- *
- * The row buffers of L and U are incremented once per kept entry and then converted into row
- * start offsets by an exclusive scan, so they are expected to hold zeros on entry (not checked here).
- */
-template<typename NumericT>
-void extract_LU(compressed_matrix<NumericT> const & A,
-                compressed_matrix<NumericT> & L,
-                compressed_matrix<NumericT> & U)
-{
-  // L and U are known to have correct dimensions
-
-  unsigned int const * src_rows   = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * src_cols   = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const * src_values = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  unsigned int * lower_rows = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int * upper_rows = detail::extract_raw_pointer<unsigned int>(U.handle1());
-
-  //
-  // Pass 1: per-row entry counts for both factors
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    unsigned int lower_kept = 0;
-    unsigned int upper_kept = 0;
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      long const c = long(src_cols[k]);
-      if (c <= r)
-        ++lower_kept;
-      if (c >= r)
-        ++upper_kept;
-    }
-    lower_rows[r] += lower_kept;
-    upper_rows[r] += upper_kept;
-  }
-
-  //
-  // Pass 2: exclusive scans turn the counts into row start offsets; the last offset is the entry count to reserve
-  //
-  viennacl::vector_base<unsigned int> lower_rows_view(L.handle1(), L.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(lower_rows_view);
-  L.reserve(lower_rows_view[L.size1()], false);
-
-  viennacl::vector_base<unsigned int> upper_rows_view(U.handle1(), U.size1() + 1, 0, 1);
-  viennacl::linalg::exclusive_scan(upper_rows_view);
-  U.reserve(upper_rows_view[U.size1()], false);
-
-  // Column and value buffers are fetched only after reserve()
-  unsigned int * lower_cols   = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT     * lower_values = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  unsigned int * upper_cols   = detail::extract_raw_pointer<unsigned int>(U.handle2());
-  NumericT     * upper_values = detail::extract_raw_pointer<NumericT>(U.handle());
-
-  //
-  // Pass 3: copy the entries row by row
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    unsigned int lower_pos = lower_rows[r];
-    unsigned int upper_pos = upper_rows[r];
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int const c = src_cols[k];
-      NumericT const     v = src_values[k];
-
-      if (long(c) <= r)
-      {
-        lower_cols[lower_pos]   = c;
-        lower_values[lower_pos] = v;
-        ++lower_pos;
-      }
-
-      if (long(c) >= r)
-      {
-        upper_cols[upper_pos]   = c;
-        upper_values[upper_pos] = v;
-        ++upper_pos;
-      }
-    }
-  }
-
-} // extract_LU
-
-
-
-/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly.
- *
- * D_i is set to 1/sqrt(|a_ii|) for each row of A that stores a diagonal entry; every stored
- * entry (i,j) of L and of U is then multiplied by D_i * D_j.
- */
-template<typename NumericT>
-void ilu_scale(compressed_matrix<NumericT> const & A,
-               compressed_matrix<NumericT> & L,
-               compressed_matrix<NumericT> & U)
-{
-  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
-
-  unsigned int const * src_rows   = detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * src_cols   = detail::extract_raw_pointer<unsigned int>(A.handle2());
-  NumericT     const * src_values = detail::extract_raw_pointer<NumericT>(A.handle());
-
-  NumericT * scale = detail::extract_raw_pointer<NumericT>(D.handle());
-
-  //
-  // Pass 1: scaling factor per row from the first stored diagonal entry
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = src_rows[r];
-    unsigned int const stop  = src_rows[r+1];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int c = src_cols[k];
-      if (r != c)
-        continue;
-
-      scale[r] = NumericT(1) / std::sqrt(std::fabs(src_values[k]));
-      break;
-    }
-  }
-
-  //
-  // Pass 2: apply the row and column factors to every stored entry of L
-  //
-  unsigned int const * lower_rows   = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const * lower_cols   = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT           * lower_values = detail::extract_raw_pointer<NumericT>(L.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = lower_rows[r];
-    unsigned int const stop  = lower_rows[r+1];
-
-    NumericT const row_factor = scale[r];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      lower_values[k] *= row_factor * scale[lower_cols[k]];
-    }
-  }
-
-  //
-  // Pass 3: the same scaling for every stored entry of U
-  //
-  unsigned int const * upper_rows   = detail::extract_raw_pointer<unsigned int>(U.handle1());
-  unsigned int const * upper_cols   = detail::extract_raw_pointer<unsigned int>(U.handle2());
-  NumericT           * upper_values = detail::extract_raw_pointer<NumericT>(U.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long r = 0; r < static_cast<long>(A.size1()); ++r)
-  {
-    unsigned int const first = upper_rows[r];
-    unsigned int const stop  = upper_rows[r+1];
-
-    NumericT const row_factor = scale[r];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      upper_values[k] *= row_factor * scale[upper_cols[k]];
-    }
-  }
-
-  L.generate_row_block_information();
-  // Note: block information for U will be generated after transposition
-
-}
-
-/** @brief Writes the transpose of the compressed matrix A to B.
- *
- * B is replaced by a new matrix of size A.size2() x A.size1() with A.nnz() entries. Its rows are
- * filled in a counting pass, a scan over the counts, and a scatter pass over the rows of A.
- */
-template<typename NumericT>
-void ilu_transpose(compressed_matrix<NumericT> const & A,
-                   compressed_matrix<NumericT> & B)
-{
-  NumericT     const * src_values = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * src_rows   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
-  unsigned int const * src_cols   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
-
-  // fresh storage for the result:
-  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
-
-  NumericT     * dst_values = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
-  unsigned int * dst_rows   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
-  unsigned int * dst_cols   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
-
-  // the row buffer of B is not initialized yet, so clear the counters first:
-  std::fill(dst_rows, dst_rows + B.size1(), 0u);
-
-  //
-  // Stage 1: each column index of A contributes one entry to the matching row of B
-  //
-  for (std::size_t src_row = 0; src_row < A.size1(); ++src_row)
-  {
-    unsigned int const first = src_rows[src_row];
-    unsigned int const stop  = src_rows[src_row+1];
-
-    for (unsigned int k = first; k < stop; ++k)
-      ++dst_rows[src_cols[k]];
-  }
-
-  // Exclusive scan: convert the per-row counts into row start offsets
-  unsigned int running = dst_rows[0];
-  dst_rows[0] = 0;
-  for (std::size_t dst_row = 1; dst_row < B.size1(); ++dst_row)
-  {
-    unsigned int const count = dst_rows[dst_row];
-    dst_rows[dst_row] = running;
-    running += count;
-  }
-  dst_rows[B.size1()] = running;
-
-  //
-  // Stage 2: scatter the entries of A into the rows of B
-  //
-
-  std::vector<unsigned int> written(B.size1()); // entries already placed in each row of B
-
-  unsigned int const src_row_count = static_cast<unsigned int>(A.size1());
-  for (unsigned int src_row = 0; src_row < src_row_count; ++src_row)
-  {
-    unsigned int const first = src_rows[src_row];
-    unsigned int const stop  = src_rows[src_row+1];
-
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int const dst_row = src_cols[k];
-      unsigned int const dst_pos = dst_rows[dst_row] + written[dst_row];
-      dst_cols[dst_pos]   = src_row;
-      dst_values[dst_pos] = src_values[k];
-      ++written[dst_row];
-    }
-  }
-
-  // Stage 3: make the data structure consistent (row blocks!)
-  B.generate_row_block_information();
-}
-
-
-
-/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper)
- *
- * All entries of L and U_trans are updated from snapshots of both factors taken at the start of
- * the sweep, so the result does not depend on the order in which rows are processed.
- *
- * @param L            Lower-triangular factor; off-diagonal values are overwritten, diagonal entries are skipped
- * @param aij_L        Entries of the system matrix, indexed like the nonzeros of L
- * @param U_trans      Upper-triangular factor stored transposed; its values are overwritten. The code relies on the diagonal entry being the last entry of each row.
- * @param aij_U_trans  Entries of the system matrix, indexed like the nonzeros of U_trans
- */
-template<typename NumericT>
-void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
-                          vector<NumericT> const & aij_L,
-                          compressed_matrix<NumericT> & U_trans,
-                          vector<NumericT> const & aij_U_trans)
-{
-  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
-  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
-  NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
-
-  NumericT const *aij_L_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle());
-
-  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1());
-  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2());
-  NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U_trans.handle());
-
-  NumericT const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle());
-
-  // Temporary workspace holding the snapshots. std::vector releases the memory on every exit
-  // path, including the case where the second allocation throws.
-  std::vector<NumericT> L_backup(L.nnz());
-  std::vector<NumericT> U_backup(U_trans.nnz());
-
-  // backup:
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
-    L_backup[static_cast<vcl_size_t>(i)] = L_elements[i];
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i)
-    U_backup[static_cast<vcl_size_t>(i)] = U_elements[i];
-
-  // sweep
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
-  {
-    //
-    // update L:
-    //
-    unsigned int row_L_start = L_row_buffer[row];
-    unsigned int row_L_end   = L_row_buffer[row + 1];
-
-    for (unsigned int j = row_L_start; j < row_L_end; ++j)
-    {
-      unsigned int col = L_col_buffer[j];
-
-      if (col == row)
-        continue;
-
-      unsigned int row_U_start = U_row_buffer[col];
-      unsigned int row_U_end   = U_row_buffer[col + 1];
-
-      // compute \sum_{k=1}^{j-1} l_ik u_kj
-      unsigned int index_U = row_U_start;
-      // an empty row gets a column index past the end so the search loop below is never entered
-      unsigned int col_U = (index_U < row_U_end) ? U_col_buffer[index_U] : static_cast<unsigned int>(U_trans.size2());
-      NumericT sum = 0;
-      for (unsigned int k = row_L_start; k < j; ++k)
-      {
-        unsigned int col_L = L_col_buffer[k];
-
-        // find element in U
-        while (col_U < col_L)
-        {
-          ++index_U;
-          col_U = U_col_buffer[index_U];
-        }
-
-        if (col_U == col_L)
-          sum += L_backup[k] * U_backup[index_U];
-      }
-
-      // update l_ij:
-      assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!"));
-      L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1];  // diagonal element is last entry in U
-    }
-
-
-    //
-    // update U:
-    //
-    unsigned int row_U_start = U_row_buffer[row];
-    unsigned int row_U_end   = U_row_buffer[row + 1];
-    for (unsigned int j = row_U_start; j < row_U_end; ++j)
-    {
-      unsigned int col = U_col_buffer[j];
-
-      row_L_start = L_row_buffer[col];
-      row_L_end   = L_row_buffer[col + 1];
-
-      // compute \sum_{k=1}^{j-1} l_ik u_kj
-      unsigned int index_L = row_L_start;
-      // an empty row gets a column index past the end so the search loop below is never entered
-      unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast<unsigned int>(L.size1());
-      NumericT sum = 0;
-      for (unsigned int k = row_U_start; k < j; ++k)
-      {
-        unsigned int col_U = U_col_buffer[k];
-
-        // find element in L
-        while (col_L < col_U)
-        {
-          ++index_L;
-          col_L = L_col_buffer[index_L];
-        }
-
-        if (col_U == col_L)
-          sum += L_backup[index_L] * U_backup[k];
-      }
-
-      // update u_ij:
-      U_elements[j] = aij_U_trans_ptr[j] - sum;
-    }
-  }
-}
-
-
-/** @brief Extracts the diagonal of R into diag_R and overwrites R with -D^{-1} (R - D), where D is that diagonal.
- *
- * Per row: the first stored diagonal entry is copied to diag_R and zeroed in R, then every stored
- * entry of the row is divided by the negated diagonal value. A zero or missing diagonal only
- * triggers an assert.
- */
-template<typename NumericT>
-void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
-                             vector<NumericT> & diag_R)
-{
-  unsigned int * row_starts = detail::extract_raw_pointer<unsigned int>(R.handle1());
-  unsigned int * col_ids    = detail::extract_raw_pointer<unsigned int>(R.handle2());
-  NumericT     * values     = detail::extract_raw_pointer<NumericT>(R.handle());
-
-  NumericT * diag_out = detail::extract_raw_pointer<NumericT>(diag_R.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
-#endif
-  for (long row = 0; row < static_cast<long>(R.size1()); ++row)
-  {
-    unsigned int const first = row_starts[row];
-    unsigned int const stop  = row_starts[row+1];
-
-    // part 1: pull the diagonal entry out of the row
-    NumericT diag = 0;
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      unsigned int col = col_ids[k];
-      if (col != row)
-        continue;
-
-      diag = values[k];
-      values[k] = 0; // (I - D^{-1}R)
-      break;
-    }
-    diag_out[row] = diag;
-
-    assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!"));
-
-    // part 2: scale the row by -1/diag
-    for (unsigned int k = first; k < stop; ++k)
-    {
-      values[k] /= -diag;
-    }
-  }
-}
-
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif