You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:37 UTC

[40/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
new file mode 100644
index 0000000..0b81203
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
@@ -0,0 +1,398 @@
+#ifndef VIENNACL_LINALG_AMG_HPP_
+#define VIENNACL_LINALG_AMG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg.hpp
+    @brief Main include file for algebraic multigrid (AMG) preconditioners.  Experimental.
+
+    Implementation contributed by Markus Wagner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/amg_operations.hpp"
+#include "viennacl/tools/timer.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+#include <map>
+
+#ifdef VIENNACL_WITH_OPENMP
+ #include <omp.h>
+#endif
+
+#define VIENNACL_AMG_MAX_LEVELS 20
+
+namespace viennacl
+{
+namespace linalg
+{
+
+class amg_coarse_problem_too_large_exception : public std::runtime_error
+{
+public:
+  amg_coarse_problem_too_large_exception(std::string const & msg, vcl_size_t num_points) : std::runtime_error(msg), c_points_(num_points) {}
+
+  /** @brief Returns the number of coarse points for which no further coarsening could be applied */
+  vcl_size_t coarse_points() const { return c_points_; }
+
+private:
+  vcl_size_t c_points_;
+};
+
+
+namespace detail
+{
+  /** @brief Sparse Galerkin product: Calculates A_coarse = trans(P)*A_fine*P = R*A_fine*P
+    *
+    * @param A_fine    Operator matrix on fine grid (quadratic)
+    * @param P         Prolongation/Interpolation matrix
+    * @param R         Restriction matrix
+    * @param A_coarse  Result matrix on coarse grid (Galerkin operator)
+    */
+  template<typename NumericT>
+  void amg_galerkin_prod(compressed_matrix<NumericT> & A_fine,
+                         compressed_matrix<NumericT> & P,
+                         compressed_matrix<NumericT> & R, //P^T
+                         compressed_matrix<NumericT> & A_coarse)
+  {
+
+    compressed_matrix<NumericT> A_fine_times_P(viennacl::traits::context(A_fine));
+
+    // transpose P in memory (no known way of efficiently multiplying P^T * B for CSR-matrices P and B):
+    viennacl::linalg::detail::amg::amg_transpose(P, R);
+
+    // compute Galerkin product using a temporary for the result of A_fine * P
+    A_fine_times_P = viennacl::linalg::prod(A_fine, P);
+    A_coarse = viennacl::linalg::prod(R, A_fine_times_P);
+
+  }
+
+
+  /** @brief Setup AMG preconditioner
+  *
+  * @param list_of_A                  Operator matrices on all levels
+  * @param list_of_P                  Prolongation/Interpolation operators on all levels
+  * @param list_of_R                  Restriction operators on all levels
+  * @param list_of_amg_level_context  Auxiliary datastructures for managing the grid hierarchy (coarse nodes, etc.)
+  * @param tag                        AMG preconditioner tag
+  */
+  template<typename NumericT, typename AMGContextListT>
+  vcl_size_t amg_setup(std::vector<compressed_matrix<NumericT> > & list_of_A,
+                       std::vector<compressed_matrix<NumericT> > & list_of_P,
+                       std::vector<compressed_matrix<NumericT> > & list_of_R,
+                       AMGContextListT & list_of_amg_level_context,
+                       amg_tag & tag)
+  {
+    // Set number of iterations. If automatic coarse grid construction is chosen (0), then set a maximum size and stop during the process.
+    vcl_size_t iterations = tag.get_coarse_levels();
+    if (iterations == 0)
+      iterations = VIENNACL_AMG_MAX_LEVELS;
+
+    for (vcl_size_t i=0; i<iterations; ++i)
+    {
+      list_of_amg_level_context[i].switch_context(tag.get_setup_context());
+      list_of_amg_level_context[i].resize(list_of_A[i].size1(), list_of_A[i].nnz());
+
+      // Construct C and F points on coarse level (i is fine level, i+1 coarse level).
+      detail::amg::amg_coarse(list_of_A[i], list_of_amg_level_context[i], tag);
+
+      // Calculate number of C and F points on level i.
+      unsigned int c_points = list_of_amg_level_context[i].num_coarse_;
+      unsigned int f_points = static_cast<unsigned int>(list_of_A[i].size1()) - c_points;
+
+      if (f_points == 0 && c_points > tag.get_coarsening_cutoff())
+      {
+        std::stringstream ss;
+        ss << "No further coarsening possible (" << c_points << " coarse points). Consider changing the strong connection threshold or increasing the coarsening cutoff." << std::endl;
+        throw amg_coarse_problem_too_large_exception(ss.str(), c_points);
+      }
+
+      // Stop routine when the maximal coarse level is found (no C or F point). Coarsest level is level i.
+      if (c_points == 0 || f_points == 0)
+        break;
+
+      // Construct interpolation matrix for level i.
+      detail::amg::amg_interpol(list_of_A[i], list_of_P[i], list_of_amg_level_context[i], tag);
+
+      // Compute coarse grid operator (A[i+1] = R * A[i] * P) with R = trans(P).
+      amg_galerkin_prod(list_of_A[i], list_of_P[i], list_of_R[i], list_of_A[i+1]);
+
+      // send matrices to target context:
+      list_of_A[i].switch_memory_context(tag.get_target_context());
+      list_of_P[i].switch_memory_context(tag.get_target_context());
+      list_of_R[i].switch_memory_context(tag.get_target_context());
+
+      // If Limit of coarse points is reached then stop. Coarsest level is level i+1.
+      if (tag.get_coarse_levels() == 0 && c_points <= tag.get_coarsening_cutoff())
+        return i+1;
+    }
+
+    return iterations;
+  }
+
+
+  /** @brief Initialize AMG preconditioner
+  *
+  * @param mat                        System matrix
+  * @param list_of_A                  Operator matrices on all levels
+  * @param list_of_P                  Prolongation/Interpolation operators on all levels
+  * @param list_of_R                  Restriction operators on all levels
+  * @param list_of_amg_level_context  Auxiliary datastructures for managing the grid hierarchy (coarse nodes, etc.)
+  * @param tag                        AMG preconditioner tag
+  */
+  template<typename MatrixT, typename InternalT1, typename InternalT2>
+  void amg_init(MatrixT const & mat, InternalT1 & list_of_A, InternalT1 & list_of_P, InternalT1 & list_of_R, InternalT2 & list_of_amg_level_context, amg_tag & tag)
+  {
+    typedef typename InternalT1::value_type SparseMatrixType;
+
+    vcl_size_t num_levels = (tag.get_coarse_levels() > 0) ? tag.get_coarse_levels() : VIENNACL_AMG_MAX_LEVELS;
+
+    list_of_A.resize(num_levels+1, SparseMatrixType(tag.get_setup_context()));
+    list_of_P.resize(num_levels,   SparseMatrixType(tag.get_setup_context()));
+    list_of_R.resize(num_levels,   SparseMatrixType(tag.get_setup_context()));
+    list_of_amg_level_context.resize(num_levels);
+
+    // Insert operator matrix as operator for finest level.
+    //SparseMatrixType A0(mat);
+    //A.insert_element(0, A0);
+    list_of_A[0].switch_memory_context(viennacl::traits::context(mat));
+    list_of_A[0] = mat;
+    list_of_A[0].switch_memory_context(tag.get_setup_context());
+  }
+
+  /** @brief Setup data structures for precondition phase for later use on the GPU
+  *
+  * @param result          Result vector on all levels
+  * @param result_backup   Copy of result vector on all levels
+  * @param rhs             RHS vector on all levels
+  * @param residual        Residual vector on all levels
+  * @param A               Operators matrices on all levels from setup phase
+  * @param coarse_levels   Number of coarse levels for which the datastructures should be set up.
+  * @param tag             AMG preconditioner tag
+  */
+  template<typename InternalVectorT, typename SparseMatrixT>
+  void amg_setup_apply(InternalVectorT & result,
+                       InternalVectorT & result_backup,
+                       InternalVectorT & rhs,
+                       InternalVectorT & residual,
+                       SparseMatrixT const & A,
+                       vcl_size_t coarse_levels,
+                       amg_tag const & tag)
+  {
+    typedef typename InternalVectorT::value_type VectorType;
+
+    result.resize(coarse_levels + 1);
+    result_backup.resize(coarse_levels + 1);
+    rhs.resize(coarse_levels + 1);
+    residual.resize(coarse_levels);
+
+    for (vcl_size_t level=0; level <= coarse_levels; ++level)
+    {
+             result[level] = VectorType(A[level].size1(), tag.get_target_context());
+      result_backup[level] = VectorType(A[level].size1(), tag.get_target_context());
+                rhs[level] = VectorType(A[level].size1(), tag.get_target_context());
+    }
+    for (vcl_size_t level=0; level < coarse_levels; ++level)
+    {
+      residual[level] = VectorType(A[level].size1(), tag.get_target_context());
+    }
+  }
+
+
+  /** @brief Pre-compute LU factorization for direct solve (ublas library).
+  *
+  * Speeds up precondition phase as this is computed only once overall instead of once per iteration.
+  *
+  * @param op           Operator matrix for direct solve
+  * @param A            Operator matrix on coarsest level
+  * @param tag          AMG preconditioner tag
+  */
+  template<typename NumericT, typename SparseMatrixT>
+  void amg_lu(viennacl::matrix<NumericT> & op,
+              SparseMatrixT const & A,
+              amg_tag const & tag)
+  {
+    op.switch_memory_context(tag.get_setup_context());
+    op.resize(A.size1(), A.size2(), false);
+    viennacl::linalg::detail::amg::assign_to_dense(A, op);
+
+    viennacl::linalg::lu_factorize(op);
+    op.switch_memory_context(tag.get_target_context());
+  }
+
+}
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class amg_precond;
+
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class amg_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef viennacl::compressed_matrix<NumericT, AlignmentV> SparseMatrixType;
+  typedef viennacl::vector<NumericT>                        VectorType;
+  typedef detail::amg::amg_level_context                    AMGContextType;
+
+public:
+
+  amg_precond() {}
+
+  /** @brief The constructor. Builds data structures.
+  *
+  * @param mat  System matrix
+  * @param tag  The AMG tag
+  */
+  amg_precond(compressed_matrix<NumericT, AlignmentV> const & mat,
+              amg_tag const & tag)
+  {
+    tag_ = tag;
+
+    // Initialize data structures.
+    detail::amg_init(mat, A_list_, P_list_, R_list_, amg_context_list_, tag_);
+  }
+
+  /** @brief Start setup phase for this class and copy data structures.
+  */
+  void setup()
+  {
+    // Start setup phase.
+    vcl_size_t num_coarse_levels = detail::amg_setup(A_list_, P_list_, R_list_, amg_context_list_, tag_);
+
+    // Setup precondition phase (Data structures).
+    detail::amg_setup_apply(result_list_, result_backup_list_, rhs_list_, residual_list_, A_list_, num_coarse_levels, tag_);
+
+    // LU factorization for direct solve.
+    detail::amg_lu(coarsest_op_, A_list_[num_coarse_levels], tag_);
+  }
+
+
+  /** @brief Precondition Operation
+  *
+  * @param vec       The vector to which preconditioning is applied to
+  */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    vcl_size_t level;
+
+    // Precondition operation (Yang, p.3).
+    rhs_list_[0] = vec;
+
+    // Part 1: Restrict down to coarsest level
+    for (level=0; level < residual_list_.size(); level++)
+    {
+      result_list_[level].clear();
+
+      // Apply Smoother presmooth_ times.
+      viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_presmooth_steps()),
+                                                   A_list_[level],
+                                                   result_list_[level],
+                                                   result_backup_list_[level],
+                                                   rhs_list_[level],
+                                                   static_cast<NumericT>(tag_.get_jacobi_weight()));
+
+      // Compute residual.
+      //residual[level] = rhs_[level] - viennacl::linalg::prod(A_[level], result_[level]);
+      residual_list_[level] = viennacl::linalg::prod(A_list_[level], result_list_[level]);
+      residual_list_[level] = rhs_list_[level] - residual_list_[level];
+
+      // Restrict to coarse level. Result is RHS of coarse level equation.
+      //residual_coarse[level] = viennacl::linalg::prod(R[level],residual[level]);
+      rhs_list_[level+1] = viennacl::linalg::prod(R_list_[level], residual_list_[level]);
+    }
+
+    // Part 2: On highest level use direct solve to solve equation (on the CPU)
+    result_list_[level] = rhs_list_[level];
+    viennacl::linalg::lu_substitute(coarsest_op_, result_list_[level]);
+
+    // Part 3: Prolongation to finest level
+    for (int level2 = static_cast<int>(residual_list_.size()-1); level2 >= 0; level2--)
+    {
+      level = static_cast<vcl_size_t>(level2);
+
+      // Interpolate error to fine level and correct solution.
+      result_backup_list_[level] = viennacl::linalg::prod(P_list_[level], result_list_[level+1]);
+      result_list_[level] += result_backup_list_[level];
+
+      // Apply Smoother postsmooth_ times.
+      viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_postsmooth_steps()),
+                                                   A_list_[level],
+                                                   result_list_[level],
+                                                   result_backup_list_[level],
+                                                   rhs_list_[level],
+                                                   static_cast<NumericT>(tag_.get_jacobi_weight()));
+    }
+    vec = result_list_[0];
+  }
+
+  /** @brief Returns the total number of multigrid levels in the hierarchy including the finest level. */
+  vcl_size_t levels() const { return residual_list_.size(); }
+
+
+  /** @brief Returns the problem/operator size at the respective multigrid level
+    *
+    * @param level     Index of the multigrid level. 0 is the finest level, levels() - 1 is the coarsest level.
+    */
+  vcl_size_t size(vcl_size_t level) const
+  {
+    assert(level < levels() && bool("Level index out of bounds!"));
+    return residual_list_[level].size();
+  }
+
+  /** @brief Returns the associated preconditioner tag containing the configuration for the multigrid preconditioner. */
+  amg_tag const & tag() const { return tag_; }
+
+private:
+  std::vector<SparseMatrixType> A_list_;
+  std::vector<SparseMatrixType> P_list_;
+  std::vector<SparseMatrixType> R_list_;
+  std::vector<AMGContextType>   amg_context_list_;
+
+  viennacl::matrix<NumericT>        coarsest_op_;
+
+  mutable std::vector<VectorType> result_list_;
+  mutable std::vector<VectorType> result_backup_list_;
+  mutable std::vector<VectorType> rhs_list_;
+  mutable std::vector<VectorType> residual_list_;
+
+  amg_tag tag_;
+};
+
+}
+}
+
+
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
new file mode 100644
index 0000000..9c7f79f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
@@ -0,0 +1,238 @@
+#ifndef VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+#define VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg_operations.hpp
+    @brief Implementations of operations for algebraic multigrid
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/host_based/amg_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/amg_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/amg_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace amg
+{
+
+template<typename NumericT, typename AMGContextT>
+void amg_influence(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_influence(A, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_influence(A, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_influence(A, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_coarse(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_coarse(A, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_coarse(A, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_coarse(A, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_interpol(compressed_matrix<NumericT> const & A,
+                  compressed_matrix<NumericT>       & P,
+                  AMGContextT & amg_context,
+                  amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT>
+void amg_transpose(compressed_matrix<NumericT> & A,
+                   compressed_matrix<NumericT> & B)
+{
+  viennacl::context orig_ctx = viennacl::traits::context(A);
+  viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+  (void)orig_ctx;
+  (void)cpu_ctx;
+
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      A.switch_memory_context(cpu_ctx);
+      B.switch_memory_context(cpu_ctx);
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      A.switch_memory_context(orig_ctx);
+      B.switch_memory_context(orig_ctx);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      A.switch_memory_context(cpu_ctx);
+      B.switch_memory_context(cpu_ctx);
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      A.switch_memory_context(orig_ctx);
+      B.switch_memory_context(orig_ctx);
+      //viennacl::linalg::cuda::amg_transpose(A, B);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename SparseMatrixType, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+assign_to_dense(SparseMatrixType const & A,
+                viennacl::matrix_base<NumericT> & B)
+{
+  assert( (A.size1() == B.size1()) && bool("Size check failed for assignment to dense matrix: size1(A) != size1(B)"));
+  assert( (A.size2() == B.size1()) && bool("Size check failed for assignment to dense matrix: size2(A) != size2(B)"));
+
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::assign_to_dense(A, B);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::assign_to_dense(A, B);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::assign_to_dense(A, B);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+                   compressed_matrix<NumericT> const & A,
+                   vector<NumericT> & x,
+                   vector<NumericT> & x_backup,
+                   vector<NumericT> const & rhs_smooth,
+                   NumericT weight)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+} //namespace amg
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
new file mode 100644
index 0000000..57bc89a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
@@ -0,0 +1,598 @@
+#ifndef VIENNACL_LINALG_BICGSTAB_HPP_
+#define VIENNACL_LINALG_BICGSTAB_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file bicgstab.hpp
+    @brief The stabilized bi-conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the stabilized Bi-conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class bicgstab_tag
+{
+public:
+  /** @brief The constructor
+  *
+  * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+  * @param max_iters        The maximum number of iterations
+  * @param max_iters_before_restart   The maximum number of iterations before BiCGStab is reinitialized (to avoid accumulation of round-off errors)
+  */
+  bicgstab_tag(double tol = 1e-8, vcl_size_t max_iters = 400, vcl_size_t max_iters_before_restart = 200)
+    : tol_(tol), abs_tol_(0), iterations_(max_iters), iterations_before_restart_(max_iters_before_restart) {}
+
+  /** @brief Returns the relative tolerance */
+  double tolerance() const { return tol_; }
+
+  /** @brief Returns the absolute tolerance */
+  double abs_tolerance() const { return abs_tol_; }
+  /** @brief Sets the absolute tolerance */
+  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+  /** @brief Returns the maximum number of iterations */
+  vcl_size_t max_iterations() const { return iterations_; }
+  /** @brief Returns the maximum number of iterations before a restart*/
+  vcl_size_t max_iterations_before_restart() const { return iterations_before_restart_; }
+
+  /** @brief Return the number of solver iterations: */
+  vcl_size_t iters() const { return iters_taken_; }
+  void iters(vcl_size_t i) const { iters_taken_ = i; }
+
+  /** @brief Returns the estimated relative error at the end of the solver run */
+  double error() const { return last_error_; }
+  /** @brief Sets the estimated relative error at the end of the solver run */
+  void error(double e) const { last_error_ = e; }
+
+private:
+  double tol_;
+  double abs_tol_;
+  vcl_size_t iterations_;
+  vcl_size_t iterations_before_restart_;
+
+  //return values from solver
+  mutable vcl_size_t iters_taken_;
+  mutable double last_error_;
+};
+
+
+
+namespace detail
+{
+  /** @brief Implementation of a pipelined stabilized Bi-conjugate gradient solver */
+  template<typename MatrixT, typename NumericT>
+  viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+                                             viennacl::vector_base<NumericT> const & rhs,
+                                             bicgstab_tag const & tag,
+                                             viennacl::linalg::no_precond,
+                                             bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                             void *monitor_data = NULL)
+  {
+    viennacl::vector<NumericT> result = viennacl::zero_vector<NumericT>(rhs.size(), viennacl::traits::context(rhs));
+
+    viennacl::vector<NumericT> residual = rhs;
+    viennacl::vector<NumericT> p = rhs;
+    viennacl::vector<NumericT> r0star = rhs;
+    viennacl::vector<NumericT> Ap = rhs;
+    viennacl::vector<NumericT> s  = rhs;
+    viennacl::vector<NumericT> As = rhs;
+
+    // Layout of temporary buffer:
+    //  chunk 0: <residual, r_0^*>
+    //  chunk 1: <As, As>
+    //  chunk 2: <As, s>
+    //  chunk 3: <Ap, r_0^*>
+    //  chunk 4: <As, r_0^*>
+    //  chunk 5: <s, s>
+    vcl_size_t buffer_size_per_vector = 256;
+    vcl_size_t num_buffer_chunks = 6;
+    viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
+    std::vector<NumericT>      host_inner_prod_buffer(inner_prod_buffer.size());
+
+    NumericT norm_rhs_host = viennacl::linalg::norm_2(residual);
+    NumericT beta;
+    NumericT alpha;
+    NumericT omega;
+    NumericT residual_norm = norm_rhs_host;
+    inner_prod_buffer[0] = norm_rhs_host * norm_rhs_host;
+
+    NumericT  r_dot_r0 = 0;
+    NumericT As_dot_As = 0;
+    NumericT As_dot_s  = 0;
+    NumericT Ap_dot_r0 = 0;
+    NumericT As_dot_r0 = 0;
+    NumericT  s_dot_s  = 0;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+      // Ap = A*p_j
+      // Ap_dot_r0 = <Ap, r_0^*>
+      viennacl::linalg::pipelined_bicgstab_prod(A, p, Ap, r0star,
+                                                inner_prod_buffer, buffer_size_per_vector, 3*buffer_size_per_vector);
+
+      //////// first (weak) synchronization point ////
+
+      ///// method 1: compute alpha on host:
+      //
+      //// we only need the second chunk of the buffer for computing Ap_dot_r0:
+      //viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+      //Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() +     buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
+
+      //alpha = residual_dot_r0 / Ap_dot_r0;
+
+      //// s_j = r_j - alpha_j q_j
+      //s = residual - alpha * Ap;
+
+      ///// method 2: compute alpha on device:
+      // s = r - alpha * Ap
+      // <s, s> first stage
+      // dump alpha at end of inner_prod_buffer
+      viennacl::linalg::pipelined_bicgstab_update_s(s, residual, Ap,
+                                                    inner_prod_buffer, buffer_size_per_vector, 5*buffer_size_per_vector);
+
+      // As = A*s_j
+      // As_dot_As = <As, As>
+      // As_dot_s  = <As, s>
+      // As_dot_r0 = <As, r_0^*>
+      viennacl::linalg::pipelined_bicgstab_prod(A, s, As, r0star,
+                                                inner_prod_buffer, buffer_size_per_vector, 4*buffer_size_per_vector);
+
+      //////// second (strong) synchronization point ////
+
+      viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+      typedef typename std::vector<NumericT>::difference_type       difference_type;
+
+       r_dot_r0 = std::accumulate(host_inner_prod_buffer.begin(),                                               host_inner_prod_buffer.begin() + difference_type(    buffer_size_per_vector), NumericT(0));
+      As_dot_As = std::accumulate(host_inner_prod_buffer.begin() + difference_type(    buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), NumericT(0));
+      As_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), NumericT(0));
+      Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), NumericT(0));
+      As_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), NumericT(0));
+       s_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(6 * buffer_size_per_vector), NumericT(0));
+
+      alpha =   r_dot_r0 / Ap_dot_r0;
+      beta  = - As_dot_r0 / Ap_dot_r0;
+      omega =   As_dot_s  / As_dot_As;
+
+      residual_norm = std::sqrt(s_dot_s - NumericT(2.0) * omega * As_dot_s + omega * omega *  As_dot_As);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      // x_{j+1} = x_j + alpha * p_j + omega * s_j
+      // r_{j+1} = s_j - omega * t_j
+      // p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+      // and compute first stage of r_dot_r0 = <r_{j+1}, r_o^*> for use in next iteration
+       viennacl::linalg::pipelined_bicgstab_vector_update(result, alpha, p, omega, s,
+                                                          residual, As,
+                                                          beta, Ap,
+                                                          r0star, inner_prod_buffer, buffer_size_per_vector);
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Implementation of the unpreconditioned stabilized Bi-conjugate gradient solver
+  *
+  * Following the description in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+  *
+  * @param matrix       The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param monitor      A callback routine which is called at each GMRES restart
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  template<typename MatrixT, typename VectorT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     bicgstab_tag const & tag,
+                     viennacl::linalg::no_precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type            NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    CPU_NumericType;
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT p = rhs;
+    VectorT r0star = rhs;
+    VectorT tmp0 = rhs;
+    VectorT tmp1 = rhs;
+    VectorT s = rhs;
+
+    CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+    CPU_NumericType ip_rr0star = norm_rhs_host * norm_rhs_host;
+    CPU_NumericType beta;
+    CPU_NumericType alpha;
+    CPU_NumericType omega;
+    //ScalarType inner_prod_temp; //temporary variable for inner product computation
+    CPU_NumericType new_ip_rr0star = 0;
+    CPU_NumericType residual_norm = norm_rhs_host;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    bool restart_flag = true;
+    vcl_size_t last_restart = 0;
+    for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+    {
+      if (restart_flag)
+      {
+        residual = viennacl::linalg::prod(matrix, result);
+        residual = rhs - residual;
+        p = residual;
+        r0star = residual;
+        ip_rr0star = viennacl::linalg::norm_2(residual);
+        ip_rr0star *= ip_rr0star;
+        restart_flag = false;
+        last_restart = i;
+      }
+
+      tag.iters(i+1);
+      tmp0 = viennacl::linalg::prod(matrix, p);
+      alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+      s = residual - alpha*tmp0;
+
+      tmp1 = viennacl::linalg::prod(matrix, s);
+      CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+      omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+      result += alpha * p + omega * s;
+      residual = s - omega * tmp1;
+
+      new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+      residual_norm = viennacl::linalg::norm_2(residual);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+      ip_rr0star = new_ip_rr0star;
+
+      if (    (ip_rr0star <= 0 && ip_rr0star >= 0)
+           || (omega <= 0 && omega >= 0)
+           || (i - last_restart > tag.max_iterations_before_restart())
+         ) //search direction degenerate. A restart might help
+        restart_flag = true;
+
+      // Execution of
+      //  p = residual + beta * (p - omega*tmp0);
+      // without introducing temporary vectors:
+      p -= omega * tmp0;
+      p = residual + beta * p;
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+
+  /** @brief Implementation of the preconditioned stabilized Bi-conjugate gradient solver
+  *
+  * Following the description of the unpreconditioned case in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+  *
+  * @param matrix       The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param precond      A preconditioner. Precondition operation is done via member function apply()
+  * @param monitor      A callback routine which is called at each GMRES restart
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  template<typename MatrixT, typename VectorT, typename PreconditionerT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     bicgstab_tag const & tag,
+                     PreconditionerT const & precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type            NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    CPU_NumericType;
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT r0star = residual;  //can be chosen arbitrarily in fact
+    VectorT tmp0 = rhs;
+    VectorT tmp1 = rhs;
+    VectorT s = rhs;
+
+    VectorT p = residual;
+
+    CPU_NumericType ip_rr0star = viennacl::linalg::norm_2(residual);
+    CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+    CPU_NumericType beta;
+    CPU_NumericType alpha;
+    CPU_NumericType omega;
+    CPU_NumericType new_ip_rr0star = 0;
+    CPU_NumericType residual_norm = norm_rhs_host;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    bool restart_flag = true;
+    vcl_size_t last_restart = 0;
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      if (restart_flag)
+      {
+        residual = viennacl::linalg::prod(matrix, result);
+        residual = rhs - residual;
+        precond.apply(residual);
+        p = residual;
+        r0star = residual;
+        ip_rr0star = viennacl::linalg::norm_2(residual);
+        ip_rr0star *= ip_rr0star;
+        restart_flag = false;
+        last_restart = i;
+      }
+
+      tag.iters(i+1);
+      tmp0 = viennacl::linalg::prod(matrix, p);
+      precond.apply(tmp0);
+      alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+      s = residual - alpha*tmp0;
+
+      tmp1 = viennacl::linalg::prod(matrix, s);
+      precond.apply(tmp1);
+      CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+      omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+      result += alpha * p + omega * s;
+      residual = s - omega * tmp1;
+
+      residual_norm = viennacl::linalg::norm_2(residual);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (residual_norm / norm_rhs_host < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+
+      beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+      ip_rr0star = new_ip_rr0star;
+
+      if ( (ip_rr0star >= 0 && ip_rr0star <= 0) || (omega >=0 && omega <= 0) || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+        restart_flag = true;
+
+      // Execution of
+      //  p = residual + beta * (p - omega*tmp0);
+      // without introducing temporary vectors:
+      p -= omega * tmp0;
+      p = residual + beta * p;
+
+      //std::cout << "Rel. Residual in current step: " << std::sqrt(std::fabs(viennacl::linalg::inner_prod(residual, residual) / norm_rhs_host)) << std::endl;
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+}
+
+
+
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+  return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+
+/** @brief Convenience overload for calling the preconditioned BiCGStab solver using types from the C++ STL.
+  *
+  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+  */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+  viennacl::compressed_matrix<NumericT> vcl_A;
+  viennacl::copy(A, vcl_A);
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::copy(rhs, vcl_rhs);
+
+  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+  std::vector<NumericT> result(vcl_result.size());
+  viennacl::copy(vcl_result, result);
+  return result;
+}
+
+/** @brief Entry point for the unpreconditioned BiCGStab method.
+ *
+ *  @param matrix    The system matrix
+ *  @param rhs       Right hand side vector (load vector)
+ *  @param tag       A BiCGStab tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag)
+{
+  return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
+
+
+
+template<typename VectorT>
+class bicgstab_solver
+{
+public:
+  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;
+
+  bicgstab_solver(bicgstab_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+  template<typename MatrixT, typename PreconditionerT>
+  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+  {
+    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+    {
+      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+      mod_rhs = b - mod_rhs;
+      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+      return init_guess_ + y;
+    }
+    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+  }
+
+
+  template<typename MatrixT>
+  VectorT operator()(MatrixT const & A, VectorT const & b) const
+  {
+    return operator()(A, b, viennacl::linalg::no_precond());
+  }
+
+  /** @brief Specifies an initial guess for the iterative solver.
+    *
+    * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+    */
+  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+   *
+   *  The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+   *  The third argument is a pointer to user-defined data, through which additional information can be passed.
+   *  This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+   *  If the montior function returns true, the solver terminates (either convergence or divergence).
+   */
+  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+  {
+    monitor_callback_ = monitor_fun;
+    user_data_ = user_data;
+  }
+
+  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+  bicgstab_tag const & tag() const { return tag_; }
+
+private:
+  bicgstab_tag  tag_;
+  VectorT       init_guess_;
+  bool          (*monitor_callback_)(VectorT const &, numeric_type, void *);
+  void          *user_data_;
+};
+
+
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
new file mode 100644
index 0000000..a2daf5e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
@@ -0,0 +1,179 @@
+#ifndef VIENNACL_LINALG_BISECT_HPP_
+#define VIENNACL_LINALG_BISECT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bisect.hpp
+*   @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
+*
+*   Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <cstddef>
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+  /**
+  *    @brief overloaded function for copying vectors
+  */
+  template<typename NumericT, typename OtherVectorT>
+  void copy_vec_to_vec(viennacl::vector<NumericT> const & src, OtherVectorT & dest)
+  {
+    viennacl::copy(src, dest);
+  }
+
+  template<typename OtherVectorT, typename NumericT>
+  void copy_vec_to_vec(OtherVectorT const & src, viennacl::vector<NumericT> & dest)
+  {
+    viennacl::copy(src, dest);
+  }
+
+  template<typename VectorT1, typename VectorT2>
+  void copy_vec_to_vec(VectorT1 const & src, VectorT2 & dest)
+  {
+    for (vcl_size_t i=0; i<src.size(); ++i)
+      dest[i] = src[i];
+  }
+
+} //namespace detail
+
+/**
+*   @brief Implementation of the bisect-algorithm for the calculation of the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
+*
+*   Refer to "Calculation of the Eigenvalues of a Symmetric Tridiagonal Matrix by the Method of Bisection" in the Handbook Series Linear Algebra, contributed by Barth, Martin, and Wilkinson.
+*   http://www.maths.ed.ac.uk/~aar/papers/bamawi.pdf
+*
+*   @param alphas       Elements of the main diagonal
+*   @param betas        Elements of the secondary diagonal
+*   @return             Returns the eigenvalues of the tridiagonal matrix defined by alpha and beta
+*/
+template<typename VectorT>
+std::vector<
+        typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
+        >
+bisect(VectorT const & alphas, VectorT const & betas)
+{
+  typedef typename viennacl::result_of::value_type<VectorT>::type           NumericType;
+  typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;
+
+  vcl_size_t size = betas.size();
+  std::vector<CPU_NumericType>  x_temp(size);
+
+
+  std::vector<CPU_NumericType> beta_bisect;
+  std::vector<CPU_NumericType> wu;
+
+  double rel_error = std::numeric_limits<CPU_NumericType>::epsilon();
+  beta_bisect.push_back(0);
+
+  for (vcl_size_t i = 1; i < size; i++)
+    beta_bisect.push_back(betas[i] * betas[i]);
+
+  double xmin = alphas[size - 1] - std::fabs(betas[size - 1]);
+  double xmax = alphas[size - 1] + std::fabs(betas[size - 1]);
+
+  for (vcl_size_t i = 0; i < size - 1; i++)
+  {
+    double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
+    if (alphas[i] + h > xmax)
+      xmax = alphas[i] + h;
+    if (alphas[i] - h < xmin)
+      xmin = alphas[i] - h;
+  }
+
+
+  double eps1 = 1e-6;
+  /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
+  if (eps1 <= 0)
+    eps1 = eps2;
+  else
+    eps2 = 0.5 * eps1 + 7.0 * eps2; */
+
+  double x0 = xmax;
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    x_temp[i] = xmax;
+    wu.push_back(xmin);
+  }
+
+  for (long k = static_cast<long>(size) - 1; k >= 0; --k)
+  {
+    double xu = xmin;
+    for (long i = k; i >= 0; --i)
+    {
+      if (xu < wu[vcl_size_t(k-i)])
+      {
+        xu = wu[vcl_size_t(i)];
+        break;
+      }
+    }
+
+    if (x0 > x_temp[vcl_size_t(k)])
+      x0 = x_temp[vcl_size_t(k)];
+
+    double x1 = (xu + x0) / 2.0;
+    while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
+    {
+      vcl_size_t a = 0;
+      double q = 1;
+      for (vcl_size_t i = 0; i < size; i++)
+      {
+        if (q > 0 || q < 0)
+          q = alphas[i] - x1 - beta_bisect[i] / q;
+        else
+          q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
+
+        if (q < 0)
+          a++;
+      }
+
+      if (a <= static_cast<vcl_size_t>(k))
+      {
+        xu = x1;
+        if (a < 1)
+          wu[0] = x1;
+        else
+        {
+          wu[a] = x1;
+          if (x_temp[a - 1] > x1)
+              x_temp[a - 1] = x1;
+        }
+      }
+      else
+        x0 = x1;
+
+      x1 = (xu + x0) / 2.0;
+    }
+    x_temp[vcl_size_t(k)] = x1;
+  }
+  return x_temp;
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
new file mode 100644
index 0000000..6918b14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
@@ -0,0 +1,173 @@
+#ifndef VIENNACL_LINALG_BISECT_GPU
+#define VIENNACL_LINALG_BISECT_GPU
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/bisect_gpu.hpp
+    @brief Implementation of an bisection algorithm for eigenvalues
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/gerschgorin.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_large.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_small.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect           The bisection algorithm computes the eigevalues
+//!                         of a symmetric tridiagonal matrix.
+//! @param diagonal         diagonal elements of the matrix
+//! @param superdiagonal    superdiagonal elements of the matrix
+//! @param eigenvalues      Vectors with the eigenvalues in ascending order
+//! @return                 return false if any errors occured
+///
+//! overloaded function template: std::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const std::vector<NumericT> & diagonal, const std::vector<NumericT> & superdiagonal, std::vector<NumericT> & eigenvalues)
+{
+  assert(diagonal.size() == superdiagonal.size() &&
+         diagonal.size() == eigenvalues.size()   &&
+         bool("Input vectors do not have the same sizes!"));
+  bool bResult = false;
+  // flag if the matrix size is due to explicit user request
+  // desired precision of eigenvalues
+  NumericT  precision = static_cast<NumericT>(0.00001);
+  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+  // set up input
+  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+  NumericT lg =  FLT_MAX;
+  NumericT ug = -FLT_MAX;
+  // compute Gerschgorin interval
+  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+  // decide wheter the algorithm for small or for large matrices will be started
+  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks,
+    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+    eigenvalues = result.std_eigenvalues;
+    bResult = true;
+  }
+
+  else
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks
+    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+    eigenvalues = result.std_eigenvalues;
+  }
+  return bResult;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect           The bisection algorithm computes the eigevalues
+//!                         of a symmetric tridiagonal matrix.
+//! @param diagonal         diagonal elements of the matrix
+//! @param superdiagonal    superdiagonal elements of the matrix
+//! @param eigenvalues      Vectors with the eigenvalues in ascending order
+//! @return                 return false if any errors occured
+///
+//! overloaded function template: viennacl::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const viennacl::vector<NumericT> & diagonal, const viennacl::vector<NumericT> & superdiagonal, viennacl::vector<NumericT> & eigenvalues)
+{
+  assert(diagonal.size() == superdiagonal.size() &&
+         diagonal.size() == eigenvalues.size()   &&
+         bool("Input vectors do not have the same sizes!"));
+  bool bResult = false;
+  // flag if the matrix size is due to explicit user request
+  // desired precision of eigenvalues
+  NumericT  precision = static_cast<NumericT>(0.00001);
+  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+  // set up input
+  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+  NumericT lg =  FLT_MAX;
+  NumericT ug = -FLT_MAX;
+  // compute Gerschgorin interval
+  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+  // decide wheter the algorithm for small or for large matrices will be started
+  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks,
+    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+    copy(result.std_eigenvalues, eigenvalues);
+    bResult = true;
+  }
+
+  else
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks
+    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+    copy(result.std_eigenvalues, eigenvalues);
+  }
+  return bResult;
+}
+} // namespace linalg
+} // namespace viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
new file mode 100644
index 0000000..93aae81
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
@@ -0,0 +1,440 @@
+#ifndef VIENNACL_LINALG_CG_HPP_
+#define VIENNACL_LINALG_CG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cg.hpp
+    @brief The conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the conjugate gradient Used for supplying solver parameters and for dispatching the solve() function
+*/
+class cg_tag
+{
+public:
+  /** @brief The constructor
+  *
+  * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+  * @param max_iterations   The maximum number of iterations
+  */
+  cg_tag(double tol = 1e-8, unsigned int max_iterations = 300) : tol_(tol), abs_tol_(0), iterations_(max_iterations) {}
+
+  /** @brief Returns the relative tolerance */
+  double tolerance() const { return tol_; }
+
+  /** @brief Returns the absolute tolerance */
+  double abs_tolerance() const { return abs_tol_; }
+  /** @brief Sets the absolute tolerance */
+  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+  /** @brief Returns the maximum number of iterations */
+  unsigned int max_iterations() const { return iterations_; }
+
+  /** @brief Return the number of solver iterations: */
+  unsigned int iters() const { return iters_taken_; }
+  void iters(unsigned int i) const { iters_taken_ = i; }
+
+  /** @brief Returns the estimated relative error at the end of the solver run */
+  double error() const { return last_error_; }
+  /** @brief Sets the estimated relative error at the end of the solver run */
+  void error(double e) const { last_error_ = e; }
+
+
+private:
+  double tol_;
+  double abs_tol_;
+  unsigned int iterations_;
+
+  //return values from solver
+  mutable unsigned int iters_taken_;
+  mutable double last_error_;
+};
+
+namespace detail
+{
+
+  /** @brief handles the no_precond case at minimal overhead */
+  template<typename VectorT, typename PreconditionerT>
+  class z_handler{
+  public:
+    z_handler(VectorT & residual) : z_(residual){ }
+    VectorT & get() { return z_; }
+  private:
+    VectorT z_;
+  };
+
+  template<typename VectorT>
+  class z_handler<VectorT, viennacl::linalg::no_precond>{
+  public:
+    z_handler(VectorT & residual) : presidual_(&residual){ }
+    VectorT & get() { return *presidual_; }
+  private:
+    VectorT * presidual_;
+  };
+
+}
+
+namespace detail
+{
+
+  /** @brief Implementation of a pipelined conjugate gradient algorithm (no preconditioner), specialized for ViennaCL types.
+  *
+  * Pipelined version from A. T. Chronopoulos and C. W. Gear, J. Comput. Appl. Math. 25(2), 153\u2013168 (1989)
+  *
+  * @param A            The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param monitor      A callback routine which is called at each GMRES restart
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  //template<typename MatrixType, typename ScalarType>
+  template<typename MatrixT, typename NumericT>
+  viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+                                             viennacl::vector<NumericT> const & rhs,
+                                             cg_tag const & tag,
+                                             viennacl::linalg::no_precond,
+                                             bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                             void *monitor_data = NULL)
+  {
+    typedef typename viennacl::vector<NumericT>::difference_type   difference_type;
+
+    viennacl::vector<NumericT> result(rhs);
+    viennacl::traits::clear(result);
+
+    viennacl::vector<NumericT> residual(rhs);
+    viennacl::vector<NumericT> p(rhs);
+    viennacl::vector<NumericT> Ap = viennacl::linalg::prod(A, p);
+    viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(3*256, viennacl::traits::context(rhs)); // temporary buffer
+    std::vector<NumericT>      host_inner_prod_buffer(inner_prod_buffer.size());
+    vcl_size_t                 buffer_size_per_vector = inner_prod_buffer.size() / 3;
+    difference_type            buffer_offset_per_vector = static_cast<difference_type>(buffer_size_per_vector);
+
+    NumericT norm_rhs_squared = viennacl::linalg::norm_2(residual); norm_rhs_squared *= norm_rhs_squared;
+
+    if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //check for early convergence of A*x = 0
+      return result;
+
+    NumericT inner_prod_rr = norm_rhs_squared;
+    NumericT alpha = inner_prod_rr / viennacl::linalg::inner_prod(p, Ap);
+    NumericT beta  = viennacl::linalg::norm_2(Ap); beta = (alpha * alpha * beta * beta - inner_prod_rr) / inner_prod_rr;
+    NumericT inner_prod_ApAp = 0;
+    NumericT inner_prod_pAp  = 0;
+
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+
+      viennacl::linalg::pipelined_cg_vector_update(result, alpha, p, residual, Ap, beta, inner_prod_buffer);
+      viennacl::linalg::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+
+      // bring back the partial results to the host:
+      viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+      inner_prod_rr   = std::accumulate(host_inner_prod_buffer.begin(),                                host_inner_prod_buffer.begin() +     buffer_offset_per_vector, NumericT(0));
+      inner_prod_ApAp = std::accumulate(host_inner_prod_buffer.begin() +     buffer_offset_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, NumericT(0));
+      inner_prod_pAp  = std::accumulate(host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, host_inner_prod_buffer.begin() + 3 * buffer_offset_per_vector, NumericT(0));
+
+      if (monitor && monitor(result, std::sqrt(std::fabs(inner_prod_rr / norm_rhs_squared)), monitor_data))
+        break;
+      if (std::fabs(inner_prod_rr / norm_rhs_squared) < tag.tolerance() *  tag.tolerance() || std::fabs(inner_prod_rr) < tag.abs_tolerance() * tag.abs_tolerance())    //squared norms involved here
+        break;
+
+      alpha = inner_prod_rr / inner_prod_pAp;
+      beta  = (alpha*alpha*inner_prod_ApAp - inner_prod_rr) / inner_prod_rr;
+    }
+
+    //store last error estimate:
+    tag.error(std::sqrt(std::fabs(inner_prod_rr) / norm_rhs_squared));
+
+    return result;
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  template<typename MatrixT, typename VectorT, typename PreconditionerT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     cg_tag const & tag,
+                     PreconditionerT const & precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type           NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;
+
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT tmp = rhs;
+    detail::z_handler<VectorT, PreconditionerT> zhandler(residual);
+    VectorT & z = zhandler.get();
+
+    precond.apply(z);
+    VectorT p = z;
+
+    CPU_NumericType ip_rr = viennacl::linalg::inner_prod(residual, z);
+    CPU_NumericType alpha;
+    CPU_NumericType new_ip_rr = 0;
+    CPU_NumericType beta;
+    CPU_NumericType norm_rhs_squared = ip_rr;
+    CPU_NumericType new_ipp_rr_over_norm_rhs;
+
+    if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //solution is zero if RHS norm (squared) is zero
+      return result;
+
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+      tmp = viennacl::linalg::prod(matrix, p);
+
+      alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
+
+      result += alpha * p;
+      residual -= alpha * tmp;
+      z = residual;
+      precond.apply(z);
+
+      if (static_cast<VectorT*>(&residual)==static_cast<VectorT*>(&z))
+        new_ip_rr = std::pow(viennacl::linalg::norm_2(residual),2);
+      else
+        new_ip_rr = viennacl::linalg::inner_prod(residual, z);
+
+      new_ipp_rr_over_norm_rhs = new_ip_rr / norm_rhs_squared;
+      if (monitor && monitor(result, std::sqrt(std::fabs(new_ipp_rr_over_norm_rhs)), monitor_data))
+        break;
+      if (std::fabs(new_ipp_rr_over_norm_rhs) < tag.tolerance() *  tag.tolerance() || std::fabs(new_ip_rr) < tag.abs_tolerance() * tag.abs_tolerance())    //squared norms involved here
+        break;
+
+      beta = new_ip_rr / ip_rr;
+      ip_rr = new_ip_rr;
+
+      p = z + beta*p;
+    }
+
+    //store last error estimate:
+    tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));
+
+    return result;
+  }
+
+}
+
+
+
+/** @brief Implementation of the preconditioned conjugate gradient solver, generic implementation for non-ViennaCL types.
+*
+* Following Algorithm 9.1 in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+*
+* @param matrix     The system matrix
+* @param rhs        The load vector
+* @param tag        Solver configuration tag
+* @param precond    A preconditioner. Precondition operation is done via member function apply()
+* @return The result vector
+*/
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+  return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+/** @brief Convenience overload for calling the CG solver using types from the C++ STL.
+  *
+  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+  */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+  viennacl::compressed_matrix<NumericT> vcl_A;
+  viennacl::copy(A, vcl_A);
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::copy(rhs, vcl_rhs);
+
+  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+  std::vector<NumericT> result(vcl_result.size());
+  viennacl::copy(vcl_result, result);
+  return result;
+}
+
+/** @brief Entry point for the unpreconditioned CG method.
+ *
+ *  @param matrix    The system matrix
+ *  @param rhs       Right hand side vector (load vector)
+ *  @param tag       A BiCGStab tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag)
+{
+  return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
+
+
+
+template<typename VectorT>
+class cg_solver
+{
+public:
+  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;
+
+  cg_solver(cg_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+  template<typename MatrixT, typename PreconditionerT>
+  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+  {
+    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+    {
+      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+      mod_rhs = b - mod_rhs;
+      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+      return init_guess_ + y;
+    }
+    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+  }
+
+
+  template<typename MatrixT>
+  VectorT operator()(MatrixT const & A, VectorT const & b) const
+  {
+    return operator()(A, b, viennacl::linalg::no_precond());
+  }
+
+  /** @brief Specifies an initial guess for the iterative solver.
+    *
+    * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+    */
+  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+   *
+   *  The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+   *  The third argument is a pointer to user-defined data, through which additional information can be passed.
+   *  This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+   *  If the montior function returns true, the solver terminates (either convergence or divergence).
+   */
+  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+  {
+    monitor_callback_ = monitor_fun;
+    user_data_ = user_data;
+  }
+
+  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+  cg_tag const & tag() const { return tag_; }
+
+private:
+  cg_tag   tag_;
+  VectorT  init_guess_;
+  bool     (*monitor_callback_)(VectorT const &, numeric_type, void *);
+  void     *user_data_;
+};
+
+
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
new file mode 100644
index 0000000..5325b7b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
@@ -0,0 +1,75 @@
+#ifndef VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/circulant_matrix_operations.hpp
+    @brief Implementations of operations using circulant_matrix. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication with a circulant_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::circulant_matrix<NumericT, AlignmentV> const & mat,
+               viennacl::vector_base<NumericT> const & vec,
+               viennacl::vector_base<NumericT>       & result)
+{
+  assert(mat.size1() == result.size() && bool("Dimension mismatch"));
+  assert(mat.size2() == vec.size() && bool("Dimension mismatch"));
+  //result.clear();
+
+  //std::cout << "prod(circulant_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
+
+  viennacl::vector<NumericT> circ(mat.elements().size() * 2);
+  viennacl::linalg::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+  viennacl::vector<NumericT> tmp(vec.size() * 2);
+  viennacl::vector<NumericT> tmp2(vec.size() * 2);
+
+  viennacl::linalg::real_to_complex(vec, tmp, vec.size());
+  viennacl::linalg::convolve(circ, tmp, tmp2);
+  viennacl::linalg::complex_to_real(tmp2, result, vec.size());
+
+}
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif