Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:39:58 UTC

[01/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Repository: mahout
Updated Branches:
  refs/heads/master 1fca0743a -> f7c1f8026


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
new file mode 100644
index 0000000..7250631
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
@@ -0,0 +1,188 @@
+#ifndef VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+#define VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <cmath>
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/matrix_operations.hpp"
+#endif
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+//#include <boost/numeric/ublas/vector.hpp>
+//#include <boost/numeric/ublas/io.hpp>
+
+/** @file viennacl/linalg/qr-method-common.hpp
+    @brief Common routines used for the QR method and SVD. Experimental.
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+
+const std::string SVD_HOUSEHOLDER_UPDATE_QR_KERNEL = "house_update_QR";
+const std::string SVD_MATRIX_TRANSPOSE_KERNEL = "transpose_inplace";
+const std::string SVD_INVERSE_SIGNS_KERNEL = "inverse_signs";
+const std::string SVD_GIVENS_PREV_KERNEL = "givens_prev";
+const std::string SVD_FINAL_ITER_UPDATE_KERNEL = "final_iter_update";
+const std::string SVD_UPDATE_QR_COLUMN_KERNEL = "update_qr_column";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
+const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
+
+namespace detail
+{
+static const double EPS = 1e-10;
+static const vcl_size_t ITER_MAX = 50;
+
+template <typename SCALARTYPE>
+SCALARTYPE pythag(SCALARTYPE a, SCALARTYPE b)
+{
+  return std::sqrt(a*a + b*b);
+}
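
For illustration, the pythag helper above is the textbook sqrt(a*a + b*b); a tiny stand-alone C++ check, also showing std::hypot, which computes the same value while guarding the intermediate squares against overflow (the operand values are arbitrary):

#include <cmath>
#include <cstdio>

int main()
{
  double a = 3.0, b = 4.0;
  std::printf("sqrt(a*a + b*b) = %f\n", std::sqrt(a*a + b*b)); // 5.0, as pythag() above
  std::printf("std::hypot(a,b) = %f\n", std::hypot(a, b));     // 5.0, robust for huge |a|, |b|
  return 0;
}
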
+
+template <typename SCALARTYPE>
+SCALARTYPE sign(SCALARTYPE val)
+{
+    return (val >= 0) ? SCALARTYPE(1) : SCALARTYPE(-1);
+}
+
+// DEPRECATED: Replace with viennacl::linalg::norm_2
+template <typename VectorType>
+typename VectorType::value_type norm_lcl(VectorType const & x, vcl_size_t size)
+{
+  typename VectorType::value_type x_norm = 0.0;
+  for(vcl_size_t i = 0; i < size; i++)
+    x_norm += std::pow(x[i], 2);
+  return std::sqrt(x_norm);
+}
+
+template <typename VectorType>
+void normalize(VectorType & x, vcl_size_t size)
+{
+  typename VectorType::value_type x_norm = norm_lcl(x, size);
+  for(vcl_size_t i = 0; i < size; i++)
+      x[i] /= x_norm;
+}
+
+
+
+template <typename VectorType>
+void householder_vector(VectorType & v, vcl_size_t start)
+{
+  typedef typename VectorType::value_type    ScalarType;
+  ScalarType x_norm = norm_lcl(v, v.size());
+  ScalarType alpha = -sign(v[start]) * x_norm;
+  v[start] += alpha;
+  normalize(v, v.size());
+}
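
To see what householder_vector() above produces, here is a self-contained sketch (plain C++, with std::vector<double> standing in for a ViennaCL vector and arbitrary numbers) that repeats the same steps and then applies the reflector H = I - 2*v*v^T to the original vector; in this demo with start = 0, every entry except the first is annihilated:

#include <cmath>
#include <cstdio>
#include <vector>

// Restatement of norm_lcl / normalize / householder_vector above for std::vector<double>.
static double norm2(const std::vector<double> &x)
{
  double s = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) s += x[i] * x[i];
  return std::sqrt(s);
}

int main()
{
  const std::vector<double> x = { 3.0, 4.0, 0.0 };   // original vector
  const std::size_t start = 0;

  std::vector<double> v = x;                         // build the Householder vector as above
  const double alpha = (v[start] >= 0 ? -1.0 : 1.0) * norm2(v);
  v[start] += alpha;
  const double vn = norm2(v);
  for (std::size_t i = 0; i < v.size(); ++i) v[i] /= vn;

  // Apply H = I - 2 v v^T: the result is sign(x[start]) * ||x|| * e_start = (5, 0, 0).
  double dot = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) dot += v[i] * x[i];
  for (std::size_t i = 0; i < x.size(); ++i)
    std::printf("Hx[%zu] = %f\n", i, x[i] - 2.0 * dot * v[i]);
  return 0;
}
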
+
+template <typename SCALARTYPE>
+void transpose(matrix_base<SCALARTYPE> & A)
+{
+  (void)A;
+#ifdef VIENNACL_WITH_OPENCL
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  if(A.row_major())
+  {
+      viennacl::linalg::opencl::kernels::svd<SCALARTYPE, row_major>::init(ctx);
+      viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE, row_major>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
+
+      viennacl::ocl::enqueue(kernel(A,
+                                    static_cast<cl_uint>(A.internal_size1()),
+                                    static_cast<cl_uint>(A.internal_size2())
+                                   )
+                            );
+  }
+  else
+  {
+      viennacl::linalg::opencl::kernels::svd<SCALARTYPE, column_major>::init(ctx);
+      viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE, column_major>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
+
+      viennacl::ocl::enqueue(kernel(A,
+                                    static_cast<cl_uint>(A.internal_size1()),
+                                    static_cast<cl_uint>(A.internal_size2())
+                                   )
+                            );
+  }
+
+#endif
+}
+
+
+
+template <typename T>
+void cdiv(T xr, T xi, T yr, T yi, T& cdivr, T& cdivi)
+{
+    // Complex scalar division.
+    T r;
+    T d;
+    if (std::fabs(yr) > std::fabs(yi))
+    {
+        r = yi / yr;
+        d = yr + r * yi;
+        cdivr = (xr + r * xi) / d;
+        cdivi = (xi - r * xr) / d;
+    }
+    else
+    {
+        r = yr / yi;
+        d = yi + r * yr;
+        cdivr = (r * xr + xi) / d;
+        cdivi = (r * xi - xr) / d;
+    }
+}
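
The branch on |yr| versus |yi| above is the usual scaling trick for complex division (often credited to Smith): it avoids forming yr*yr + yi*yi, which can overflow. A small stand-alone check against std::complex (operand values are arbitrary):

#include <cmath>
#include <complex>
#include <cstdio>

// Same arithmetic as cdiv() above, restated as a free function on doubles.
static void cdiv_check(double xr, double xi, double yr, double yi)
{
  double r, d, cr, ci;
  if (std::fabs(yr) > std::fabs(yi))
  {
    r = yi / yr;  d = yr + r * yi;
    cr = (xr + r * xi) / d;   ci = (xi - r * xr) / d;
  }
  else
  {
    r = yr / yi;  d = yi + r * yr;
    cr = (r * xr + xi) / d;   ci = (r * xi - xr) / d;
  }
  const std::complex<double> ref = std::complex<double>(xr, xi) / std::complex<double>(yr, yi);
  std::printf("cdiv: %g + %gi    std::complex: %g + %gi\n", cr, ci, ref.real(), ref.imag());
}

int main()
{
  cdiv_check(1.0, 2.0, 3.0, 4.0);           // (1+2i)/(3+4i) = 0.44 + 0.08i
  cdiv_check(1e300, 1e300, 1e300, 1e300);   // naive yr*yr + yi*yi overflows; scaled form gives 1 + 0i
  return 0;
}
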
+
+
+template<typename SCALARTYPE>
+void prepare_householder_vector(
+                              matrix_base<SCALARTYPE>& A,
+                              vector_base<SCALARTYPE>& D,
+                              vcl_size_t size,
+                              vcl_size_t row_start,
+                              vcl_size_t col_start,
+                              vcl_size_t start,
+                              bool is_column
+                              )
+{
+  //boost::numeric::ublas::vector<SCALARTYPE> tmp = boost::numeric::ublas::scalar_vector<SCALARTYPE>(size, 0);
+  std::vector<SCALARTYPE> tmp(size);
+  copy_vec(A, D, row_start, col_start, is_column);
+  fast_copy(D.begin(), D.begin() + vcl_ptrdiff_t(size - start), tmp.begin() + vcl_ptrdiff_t(start));
+
+  detail::householder_vector(tmp, start);
+  fast_copy(tmp, D);
+}
+
+} //detail
+}
+}
+
+#endif


[46/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..f1719a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
@@ -0,0 +1,619 @@
+#ifndef VIENNACL_COMPRESSED_compressed_compressed_matrix_HPP_
+#define VIENNACL_COMPRESSED_compressed_compressed_matrix_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_compressed_matrix.hpp
+    @brief Implementation of the compressed_compressed_matrix class (CSR format with a relatively small number of nonzero rows)
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+namespace detail
+{
+  template<typename CPUMatrixT, typename NumericT>
+  void copy_impl(const CPUMatrixT & cpu_matrix,
+                 compressed_compressed_matrix<NumericT> & gpu_matrix,
+                 vcl_size_t nonzero_rows,
+                 vcl_size_t nonzeros)
+  {
+    assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+    assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), nonzero_rows + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle3(), nonzero_rows);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+    std::vector<NumericT> elements(nonzeros);
+
+    vcl_size_t row_index  = 0;
+    vcl_size_t data_index = 0;
+
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+         row_it != cpu_matrix.end1();
+         ++row_it)
+    {
+      bool row_empty = true;
+
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+           col_it != row_it.end();
+           ++col_it)
+      {
+        NumericT entry = *col_it;
+        if (entry < 0 || entry > 0)  // entry != 0 without compiler warnings
+        {
+          if (row_empty)
+          {
+            assert(row_index < nonzero_rows && bool("Provided count of nonzero rows exceeded!"));
+
+            row_empty = false;
+            row_buffer.set(row_index, data_index);
+            row_indices.set(row_index, col_it.index1());
+            ++row_index;
+          }
+
+          col_buffer.set(data_index, col_it.index2());
+          elements[data_index] = entry;
+          ++data_index;
+        }
+      }
+    }
+    row_buffer.set(row_index, data_index);
+
+    gpu_matrix.set(row_buffer.get(),
+                   row_indices.get(),
+                   col_buffer.get(),
+                   &elements[0],
+        cpu_matrix.size1(),
+        cpu_matrix.size2(),
+        nonzero_rows,
+        nonzeros);
+  }
+}
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+  *
+  * There are some type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - .size1() returns the number of rows
+  * - .size2() returns the number of columns
+  * - const_iterator1    is a type definition for an iterator along increasing row indices
+  * - const_iterator2    is a type definition for an iterator along increasing column indices
+  * - The const_iterator1 type provides an iterator of type const_iterator2 via members .begin() and .end() that iterates along column indices in the current row.
+  * - The types const_iterator1 and const_iterator2 provide member functions .index1() and .index2() that return the current row and column indices, respectively.
+  * - Dereferencing an object of type const_iterator2 returns the entry.
+  *
+  * @param cpu_matrix   A sparse matrix on the host.
+  * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+  */
+template<typename CPUMatrixT, typename NumericT>
+void copy(const CPUMatrixT & cpu_matrix,
+          compressed_compressed_matrix<NumericT> & gpu_matrix )
+{
+  //std::cout << "copy for (" << cpu_matrix.size1() << ", " << cpu_matrix.size2() << ", " << cpu_matrix.nnz() << ")" << std::endl;
+
+  if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+  {
+    //determine nonzero rows and total nonzeros:
+    vcl_size_t num_entries = 0;
+    vcl_size_t nonzero_rows = 0;
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+         row_it != cpu_matrix.end1();
+         ++row_it)
+    {
+      bool row_empty = true;
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+           col_it != row_it.end();
+           ++col_it)
+      {
+        NumericT val = *col_it;
+        if (val < 0 || val > 0) // val != 0 without compiler warnings
+        {
+          ++num_entries;
+
+          if (row_empty)
+          {
+            row_empty = false;
+            ++nonzero_rows;
+          }
+        }
+      }
+    }
+
+    if (num_entries == 0) //we copy an empty matrix
+      num_entries = 1;
+
+    //set up matrix entries:
+    viennacl::detail::copy_impl(cpu_matrix, gpu_matrix, nonzero_rows, num_entries);
+  }
+}
+
+
+//adapted for std::vector< std::map < > > argument:
+/** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+  *
+  * @param cpu_matrix   A sparse square matrix on the host using STL types
+  * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+  */
+template<typename SizeT, typename NumericT>
+void copy(const std::vector< std::map<SizeT, NumericT> > & cpu_matrix,
+          compressed_compressed_matrix<NumericT> & gpu_matrix )
+{
+  vcl_size_t nonzero_rows = 0;
+  vcl_size_t nonzeros = 0;
+  vcl_size_t max_col = 0;
+  for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+  {
+    if (cpu_matrix[i].size() > 0)
+      ++nonzero_rows;
+    nonzeros += cpu_matrix[i].size();
+    if (cpu_matrix[i].size() > 0)
+      max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+  }
+
+  viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<NumericT, SizeT>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+                              gpu_matrix,
+                              nonzero_rows,
+                              nonzeros);
+}
+
+
+//
+// gpu to cpu:
+//
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+  *
+  * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+  * - operator(i,j)       Write new entries via the parenthesis operator
+  *
+  * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename CPUMatrixT, typename NumericT>
+void copy(const compressed_compressed_matrix<NumericT> & gpu_matrix,
+          CPUMatrixT & cpu_matrix )
+{
+  assert( (cpu_matrix.size1() == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (cpu_matrix.size2() == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+  {
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.nnz1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle3(), gpu_matrix.nnz1());
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+    std::vector<NumericT> elements(gpu_matrix.nnz());
+
+    //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+    viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle3(), 0, row_indices.raw_size(), row_indices.get());
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+    //fill the cpu_matrix:
+    vcl_size_t data_index = 0;
+    for (vcl_size_t i = 1; i < row_buffer.size(); ++i)
+    {
+      while (data_index < row_buffer[i])
+      {
+        if (col_buffer[data_index] >= gpu_matrix.size2())
+        {
+          std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+          return;
+        }
+
+        NumericT val = elements[data_index];
+        if (val < 0 || val > 0) // val != 0 without compiler warning
+          cpu_matrix(row_indices[i-1], col_buffer[data_index]) = val;
+        ++data_index;
+      }
+    }
+  }
+}
+
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format.
+  *
+  * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename NumericT>
+void copy(const compressed_compressed_matrix<NumericT> & gpu_matrix,
+          std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+  tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
+  copy(gpu_matrix, temp);
+}
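
A minimal round-trip sketch for the copy() overloads above, using the std::vector< std::map<> > host format; it assumes the headers added by this commit are on the include path and that the default ViennaCL context/backend is usable:

#include <cstdio>
#include <map>
#include <vector>
#include "viennacl/compressed_compressed_matrix.hpp"

int main()
{
  // 5x5 host matrix in which only rows 1 and 3 carry nonzeros.
  std::vector< std::map<unsigned int, double> > host_A(5), host_B(5);
  host_A[1][0] = 2.0;  host_A[1][4] = -1.0;
  host_A[3][3] = 7.5;

  viennacl::compressed_compressed_matrix<double> device_A;
  viennacl::copy(host_A, device_A);   // host -> device overload above
  viennacl::copy(device_A, host_B);   // device -> host overload above

  // Two nonzero rows, three stored entries; the round trip reproduces the values.
  std::printf("nnz1 = %lu, nnz = %lu, host_B[3][3] = %f\n",
              static_cast<unsigned long>(device_A.nnz1()),
              static_cast<unsigned long>(device_A.nnz()),
              host_B[3][3]);
  return 0;
}
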
+
+
+//////////////////////// compressed_compressed_matrix //////////////////////////
+/** @brief A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
+  *
+  * The difference to the 'standard' CSR format is that there is an additional array 'row_indices' so that the i-th set of indices in the CSR-layout refers to row_indices[i].
+  *
+  * @tparam NumericT    The floating point type (either float or double, checked at compile time)
+  * @tparam AlignmentV     The internal memory size for the entries in each row is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+  */
+template<class NumericT>
+class compressed_compressed_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+  typedef vcl_size_t                                                                                 size_type;
+
+  /** @brief Default construction of a compressed matrix. No memory is allocated */
+  compressed_compressed_matrix() : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0) {}
+
+  /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+      *
+      * @param rows         Number of rows
+      * @param cols         Number of columns
+      * @param nonzero_rows Optional number of nonzero rows for memory preallocation
+      * @param nonzeros     Optional number of nonzeros for memory preallocation
+      * @param ctx          Context in which to create the matrix. Uses the default context if omitted
+      */
+  explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows = 0, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+    : rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    row_indices_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      row_indices_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+    if (rows > 0)
+    {
+      viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+    }
+    if (nonzeros > 0)
+    {
+      viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+      viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, ctx);
+    }
+  }
+
+  /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param ctx      Context in which to create the matrix
+      */
+  explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+    : rows_(rows), cols_(cols), nonzero_rows_(0), nonzeros_(0)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+    if (rows > 0)
+    {
+      viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+    }
+  }
+
+  explicit compressed_compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    row_indices_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      row_indices_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  explicit compressed_compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_row_indices, cl_mem mem_col_buffer, cl_mem mem_elements,
+                                        vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows, vcl_size_t nonzeros) :
+    rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+  {
+    row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    row_buffer_.opencl_handle() = mem_row_buffer;
+    row_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+    row_buffer_.raw_size(sizeof(cl_uint) * (nonzero_rows + 1));
+
+    row_indices_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    row_indices_.opencl_handle() = mem_row_indices;
+    row_indices_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+    row_indices_.raw_size(sizeof(cl_uint) * nonzero_rows);
+
+    col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    col_buffer_.opencl_handle() = mem_col_buffer;
+    col_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+    col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+    elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    elements_.opencl_handle() = mem_elements;
+    elements_.opencl_handle().inc();               //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+    elements_.raw_size(sizeof(NumericT) * nonzeros);
+  }
+#endif
+
+
+  /** @brief Assignment of a compressed matrix from possibly another memory domain. */
+  compressed_compressed_matrix & operator=(compressed_compressed_matrix const & other)
+  {
+    assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+    assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+    rows_ = other.size1();
+    cols_ = other.size2();
+    nonzero_rows_ = other.nnz1();
+    nonzeros_ = other.nnz();
+
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_,  row_buffer_);
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_indices_, row_indices_);
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_,  col_buffer_);
+    viennacl::backend::typesafe_memory_copy<NumericT>(other.elements_, elements_);
+
+    return *this;
+  }
+
+
+  /** @brief Sets the row, column and value arrays of the compressed matrix
+      *
+      * @param row_jumper     Pointer to an array holding the indices of the first element of each nonzero row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th nonzero row. The array length is 'nonzero_rows + 1'
+      * @param row_indices    Array holding the indices of the nonzero rows
+      * @param col_buffer     Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+      * @param elements       Pointer to an array holding the entries of the sparse matrix. The array length is 'nonzeros'
+      * @param rows           Number of rows of the sparse matrix
+      * @param cols           Number of columns of the sparse matrix
+      * @param nonzero_rows   Number of nonzero rows
+      * @param nonzeros       Total number of nonzero entries
+      */
+  void set(const void * row_jumper,
+           const void * row_indices,
+           const void * col_buffer,
+           const NumericT * elements,
+           vcl_size_t rows,
+           vcl_size_t cols,
+           vcl_size_t nonzero_rows,
+           vcl_size_t nonzeros)
+  {
+    assert( (rows > 0)         && bool("Error in compressed_compressed_matrix::set(): Number of rows must be larger than zero!"));
+    assert( (cols > 0)         && bool("Error in compressed_compressed_matrix::set(): Number of columns must be larger than zero!"));
+    assert( (nonzero_rows > 0) && bool("Error in compressed_compressed_matrix::set(): Number of nonzero rows must be larger than zero!"));
+    assert( (nonzeros > 0)     && bool("Error in compressed_compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+    //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+    viennacl::backend::memory_create(row_buffer_,  viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (nonzero_rows + 1),  viennacl::traits::context(row_buffer_),  row_jumper);
+    viennacl::backend::memory_create(row_indices_, viennacl::backend::typesafe_host_array<unsigned int>(row_indices_).element_size() * nonzero_rows, viennacl::traits::context(row_indices_), row_indices);
+    viennacl::backend::memory_create(col_buffer_,  viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros,    viennacl::traits::context(col_buffer_),  col_buffer);
+    viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, viennacl::traits::context(elements_), elements);
+
+    nonzeros_ = nonzeros;
+    nonzero_rows_ = nonzero_rows;
+    rows_ = rows;
+    cols_ = cols;
+  }
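
As a worked illustration of the doubly compressed layout that set() expects, consider a 4x4 matrix with nonzeros only in rows 0 and 2; the sketch below assumes the backend stores its index buffers as unsigned int and that a default context is available:

#include "viennacl/compressed_compressed_matrix.hpp"

int main()
{
  //   [ .   1   .   2 ]
  //   [ .   .   .   . ]
  //   [ .   .   3   . ]
  //   [ .   .   .   . ]
  unsigned int row_jumper[]  = { 0, 2, 3 };        // start of each nonzero row, plus the total
  unsigned int row_indices[] = { 0, 2 };           // which rows those groups belong to
  unsigned int col_buffer[]  = { 1, 3, 2 };        // column index of every stored entry
  double       elements[]    = { 1.0, 2.0, 3.0 };  // the entries themselves

  viennacl::compressed_compressed_matrix<double> A;
  A.set(row_jumper, row_indices, col_buffer, elements,
        4, 4,    // rows, cols
        2, 3);   // nonzero rows, nonzeros
  return 0;
}
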
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+  void clear()
+  {
+    viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, rows_ + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> host_row_indices(row_indices_, rows_ + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(col_buffer_, 1);
+    std::vector<NumericT> host_elements(1);
+
+    viennacl::backend::memory_create(row_buffer_,  host_row_buffer.element_size() * (rows_ + 1),  viennacl::traits::context(row_buffer_),  host_row_buffer.get());
+    viennacl::backend::memory_create(row_indices_, host_row_indices.element_size() * (rows_ + 1), viennacl::traits::context(row_indices_), host_row_indices.get());
+    viennacl::backend::memory_create(col_buffer_,  host_col_buffer.element_size() * 1,            viennacl::traits::context(col_buffer_),  host_col_buffer.get());
+    viennacl::backend::memory_create(elements_,    sizeof(NumericT) * 1,                          viennacl::traits::context(elements_),    &(host_elements[0]));
+
+    nonzeros_ = 0;
+    nonzero_rows_ = 0;
+  }
+
+  /** @brief  Returns the number of rows */
+  const vcl_size_t & size1() const { return rows_; }
+  /** @brief  Returns the number of columns */
+  const vcl_size_t & size2() const { return cols_; }
+  /** @brief  Returns the number of nonzero rows */
+  const vcl_size_t & nnz1() const { return nonzero_rows_; }
+  /** @brief  Returns the number of nonzero entries */
+  const vcl_size_t & nnz() const { return nonzeros_; }
+
+  /** @brief  Returns the OpenCL handle to the row index array */
+  const handle_type & handle1() const { return row_buffer_; }
+  /** @brief  Returns the OpenCL handle to the column index array */
+  const handle_type & handle2() const { return col_buffer_; }
+  /** @brief  Returns the OpenCL handle to the array of nonzero row indices */
+  const handle_type & handle3() const { return row_indices_; }
+  /** @brief  Returns the OpenCL handle to the matrix entry array */
+  const handle_type & handle() const { return elements_; }
+
+  /** @brief  Returns the OpenCL handle to the row index array */
+  handle_type & handle1() { return row_buffer_; }
+  /** @brief  Returns the OpenCL handle to the column index array */
+  handle_type & handle2() { return col_buffer_; }
+  /** @brief  Returns the OpenCL handle to the array of nonzero row indices */
+  handle_type & handle3() { return row_indices_; }
+  /** @brief  Returns the OpenCL handle to the matrix entry array */
+  handle_type & handle() { return elements_; }
+
+  void switch_memory_context(viennacl::context new_ctx)
+  {
+    viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+    viennacl::backend::switch_memory_context<unsigned int>(row_indices_, new_ctx);
+    viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+    viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx);
+  }
+
+  viennacl::memory_types memory_context() const
+  {
+    return row_buffer_.get_active_handle_id();
+  }
+
+private:
+
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t nonzero_rows_;
+  vcl_size_t nonzeros_;
+  handle_type row_buffer_;
+  handle_type row_indices_;
+  handle_type col_buffer_;
+  handle_type elements_;
+};
+
+
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+
+  template<typename T>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x += A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+
+  template<typename T>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x -= A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
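
The three specializations above are what make expressions such as x = prod(A, x) safe: whenever the result vector aliases the right-hand side argument, the product is first computed into a temporary. A hedged usage sketch, assuming viennacl::linalg::prod() from viennacl/linalg/prod.hpp accepts this sparse type as it does the other sparse matrices, and the usual viennacl::copy() for vectors:

#include <map>
#include <vector>
#include "viennacl/compressed_compressed_matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/vector.hpp"

int main()
{
  std::vector< std::map<unsigned int, double> > host_A(3);
  host_A[0][0] = 1.0;  host_A[0][2] = 2.0;  host_A[2][1] = 4.0;   // 3x3, rows 0 and 2 nonzero

  viennacl::compressed_compressed_matrix<double> A;
  viennacl::copy(host_A, A);

  std::vector<double> host_x(3, 1.0);
  viennacl::vector<double> x(3), y(3);
  viennacl::copy(host_x, x);

  y  = viennacl::linalg::prod(A, x);   // op_assign specialization above
  y += viennacl::linalg::prod(A, x);   // op_inplace_add specialization above
  x  = viennacl::linalg::prod(A, x);   // aliasing case: routed through a temporary
  return 0;
}
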
+
+
+  // x = A * vec_op
+  template<typename T, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x += A * vec_op
+  template<typename T, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x -= A * vec_op
+  template<typename T, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
new file mode 100644
index 0000000..e42f552
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
@@ -0,0 +1,1178 @@
+#ifndef VIENNACL_COMPRESSED_MATRIX_HPP_
+#define VIENNACL_COMPRESSED_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_matrix.hpp
+    @brief Implementation of the compressed_matrix class
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+#ifdef VIENNACL_WITH_UBLAS
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#endif
+
+namespace viennacl
+{
+namespace detail
+{
+
+  /** @brief Implementation of the copy of a host-based sparse matrix to the device.
+    *
+    * See convenience copy() routines for type requirements of CPUMatrixT
+    */
+  template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+  void copy_impl(const CPUMatrixT & cpu_matrix,
+                 compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+                 vcl_size_t nonzeros)
+  {
+    assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+    assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+    std::vector<NumericT> elements(nonzeros);
+
+    vcl_size_t row_index  = 0;
+    vcl_size_t data_index = 0;
+
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+         row_it != cpu_matrix.end1();
+         ++row_it)
+    {
+      row_buffer.set(row_index, data_index);
+      ++row_index;
+
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+           col_it != row_it.end();
+           ++col_it)
+      {
+        col_buffer.set(data_index, col_it.index2());
+        elements[data_index] = *col_it;
+        ++data_index;
+      }
+      data_index = viennacl::tools::align_to_multiple<vcl_size_t>(data_index, AlignmentV); //take care of alignment
+    }
+    row_buffer.set(row_index, data_index);
+
+    gpu_matrix.set(row_buffer.get(),
+                   col_buffer.get(),
+                   &elements[0],
+        cpu_matrix.size1(),
+        cpu_matrix.size2(),
+        nonzeros);
+  }
+}
+
+//
+// host to device:
+//
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+  *
+  * There are some type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - .size1() returns the number of rows
+  * - .size2() returns the number of columns
+  * - const_iterator1    is a type definition for an iterator along increasing row indices
+  * - const_iterator2    is a type definition for an iterator along increasing column indices
+  * - The const_iterator1 type provides an iterator of type const_iterator2 via members .begin() and .end() that iterates along column indices in the current row.
+  * - The types const_iterator1 and const_iterator2 provide member functions .index1() and .index2() that return the current row and column indices, respectively.
+  * - Dereferencing an object of type const_iterator2 returns the entry.
+  *
+  * @param cpu_matrix   A sparse matrix on the host.
+  * @param gpu_matrix   A compressed_matrix from ViennaCL
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT & cpu_matrix,
+          compressed_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+  {
+    //determine nonzeros:
+    vcl_size_t num_entries = 0;
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+         row_it != cpu_matrix.end1();
+         ++row_it)
+    {
+      vcl_size_t entries_per_row = 0;
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+           col_it != row_it.end();
+           ++col_it)
+      {
+        ++entries_per_row;
+      }
+      num_entries += viennacl::tools::align_to_multiple<vcl_size_t>(entries_per_row, AlignmentV);
+    }
+
+    if (num_entries == 0) //we copy an empty matrix
+      num_entries = 1;
+
+    //set up matrix entries:
+    viennacl::detail::copy_impl(cpu_matrix, gpu_matrix, num_entries);
+  }
+}
+
+
+//adapted for std::vector< std::map < > > argument:
+/** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+  *
+  * @param cpu_matrix   A sparse square matrix on the host using STL types
+  * @param gpu_matrix   A compressed_matrix from ViennaCL
+  */
+template<typename SizeT, typename NumericT, unsigned int AlignmentV>
+void copy(const std::vector< std::map<SizeT, NumericT> > & cpu_matrix,
+          compressed_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  vcl_size_t nonzeros = 0;
+  vcl_size_t max_col = 0;
+  for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+  {
+    if (cpu_matrix[i].size() > 0)
+      nonzeros += ((cpu_matrix[i].size() - 1) / AlignmentV + 1) * AlignmentV;
+    if (cpu_matrix[i].size() > 0)
+      max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+  }
+
+  viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<NumericT, SizeT>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+                              gpu_matrix,
+                              nonzeros);
+}
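
Both overloads above pad the per-row entry count up to a multiple of AlignmentV before allocating device memory. A tiny stand-alone check of that round-up formula (the helper name here is made up for the illustration):

#include <cstdio>

// Round-up used above: ((size - 1) / AlignmentV + 1) * AlignmentV for size > 0.
static unsigned long padded(unsigned long entries_in_row, unsigned long alignment)
{
  return ((entries_in_row - 1) / alignment + 1) * alignment;
}

int main()
{
  std::printf("%lu\n", padded(5, 4));  // 8: a row with 5 entries reserves 8 slots
  std::printf("%lu\n", padded(8, 4));  // 8: already aligned
  std::printf("%lu\n", padded(3, 1));  // 3: AlignmentV == 1 means no padding
  return 0;
}
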
+
+#ifdef VIENNACL_WITH_UBLAS
+/** @brief Convenience routine for copying a sparse uBLAS matrix to a ViennaCL matrix.
+  *
+  * Optimization which copies the data directly from the internal uBLAS buffers.
+  */
+template<typename ScalarType, typename F, vcl_size_t IB, typename IA, typename TA>
+void copy(const boost::numeric::ublas::compressed_matrix<ScalarType, F, IB, IA, TA> & ublas_matrix,
+          viennacl::compressed_matrix<ScalarType, 1> & gpu_matrix)
+{
+  assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  //we just need to copy the CSR arrays:
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), ublas_matrix.size1() + 1);
+  for (vcl_size_t i=0; i<=ublas_matrix.size1(); ++i)
+    row_buffer.set(i, ublas_matrix.index1_data()[i]);
+
+  viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), ublas_matrix.nnz());
+  for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+    col_buffer.set(i, ublas_matrix.index2_data()[i]);
+
+  gpu_matrix.set(row_buffer.get(),
+                 col_buffer.get(),
+                 &(ublas_matrix.value_data()[0]),
+      ublas_matrix.size1(),
+      ublas_matrix.size2(),
+      ublas_matrix.nnz());
+
+}
+#endif
+
+#ifdef VIENNACL_WITH_ARMADILLO
+/** @brief Convenience routine for copying a sparse Armadillo matrix to a ViennaCL matrix.
+  *
+  * Since Armadillo stores sparse matrices in a column-major (CSC) format, while ViennaCL uses row-major CSR, the data has to be converted.
+  * This is done fairly efficiently by working on the CSC arrays directly, rather than (slowly) building an intermediate STL matrix.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(arma::SpMat<NumericT> const & arma_matrix,
+          viennacl::compressed_matrix<NumericT, AlignmentV> & vcl_matrix)
+{
+  assert( (vcl_matrix.size1() == 0 || static_cast<vcl_size_t>(arma_matrix.n_rows) == vcl_matrix.size1()) && bool("Size mismatch") );
+  assert( (vcl_matrix.size2() == 0 || static_cast<vcl_size_t>(arma_matrix.n_cols) == vcl_matrix.size2()) && bool("Size mismatch") );
+
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(vcl_matrix.handle1(), arma_matrix.n_rows + 1);
+  viennacl::backend::typesafe_host_array<unsigned int> col_buffer(vcl_matrix.handle2(), arma_matrix.n_nonzero);
+  viennacl::backend::typesafe_host_array<NumericT    > value_buffer(vcl_matrix.handle(), arma_matrix.n_nonzero);
+
+  // Step 1: Count number of nonzeros in each row
+  for (vcl_size_t col=0; col < static_cast<vcl_size_t>(arma_matrix.n_cols); ++col)
+  {
+    vcl_size_t col_begin = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col]);
+    vcl_size_t col_end   = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col+1]);
+    for (vcl_size_t i = col_begin; i < col_end; ++i)
+    {
+      unsigned int row = arma_matrix.row_indices[i];
+      row_buffer.set(row, row_buffer[row] + 1);
+    }
+  }
+
+  // Step 2: Exclusive scan on row_buffer to obtain offsets
+  unsigned int offset = 0;
+  for (vcl_size_t i=0; i<row_buffer.size(); ++i)
+  {
+    unsigned int tmp = row_buffer[i];
+    row_buffer.set(i, offset);
+    offset += tmp;
+  }
+
+  // Step 3: Fill data
+  std::vector<unsigned int> row_offsets(arma_matrix.n_rows);
+  for (vcl_size_t col=0; col < static_cast<vcl_size_t>(arma_matrix.n_cols); ++col)
+  {
+    vcl_size_t col_begin = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col]);
+    vcl_size_t col_end   = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col+1]);
+    for (vcl_size_t i = col_begin; i < col_end; ++i)
+    {
+      unsigned int row = arma_matrix.row_indices[i];
+      col_buffer.set(row_buffer[row] + row_offsets[row], col);
+      value_buffer.set(row_buffer[row] + row_offsets[row], arma_matrix.values[i]);
+      row_offsets[row] += 1;
+    }
+  }
+
+  vcl_matrix.set(row_buffer.get(), col_buffer.get(), reinterpret_cast<NumericT*>(value_buffer.get()),
+                 arma_matrix.n_rows, arma_matrix.n_cols, arma_matrix.n_nonzero);
+}
+#endif
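
The three steps above (count nonzeros per row, exclusive scan to get row offsets, scatter the entries) are the standard CSC-to-CSR conversion. A stand-alone illustration of the same procedure on plain arrays, without Armadillo:

#include <cstdio>
#include <vector>

int main()
{
  // CSC form of the 3x3 matrix  [ 1  .  2 ]
  //                             [ .  3  . ]
  //                             [ .  .  4 ]
  const unsigned int n_rows = 3, n_cols = 3, nnz = 4;
  const unsigned int col_ptrs[]    = { 0, 1, 2, 4 };
  const unsigned int row_indices[] = { 0, 1, 0, 2 };
  const double       values[]      = { 1.0, 3.0, 2.0, 4.0 };

  // Step 1: count nonzeros per row.
  std::vector<unsigned int> row_ptrs(n_rows + 1, 0);
  for (unsigned int i = 0; i < nnz; ++i)
    row_ptrs[row_indices[i]] += 1;

  // Step 2: exclusive scan turns the counts into row offsets.
  unsigned int offset = 0;
  for (unsigned int r = 0; r <= n_rows; ++r)
  {
    const unsigned int tmp = row_ptrs[r];
    row_ptrs[r] = offset;
    offset += tmp;
  }

  // Step 3: scatter column indices and values into their rows.
  std::vector<unsigned int> csr_cols(nnz);
  std::vector<double>       csr_vals(nnz);
  std::vector<unsigned int> fill(n_rows, 0);
  for (unsigned int col = 0; col < n_cols; ++col)
    for (unsigned int i = col_ptrs[col]; i < col_ptrs[col + 1]; ++i)
    {
      const unsigned int row = row_indices[i];
      csr_cols[row_ptrs[row] + fill[row]] = col;
      csr_vals[row_ptrs[row] + fill[row]] = values[i];
      ++fill[row];
    }

  // Result: row_ptrs = {0, 2, 3, 4}, csr_cols = {0, 2, 1, 2}, csr_vals = {1, 2, 3, 4}.
  for (unsigned int i = 0; i < nnz; ++i)
    std::printf("%u %g\n", csr_cols[i], csr_vals[i]);
  return 0;
}
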
+
+#ifdef VIENNACL_WITH_EIGEN
+/** @brief Convenience routine for copying a sparse Eigen matrix to a ViennaCL matrix.
+  *
+  * Builds a temporary STL matrix. Patches for avoiding the temporary matrix welcome.
+  */
+template<typename NumericT, int flags, unsigned int AlignmentV>
+void copy(const Eigen::SparseMatrix<NumericT, flags> & eigen_matrix,
+          compressed_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+  assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  std::vector< std::map<unsigned int, NumericT> >  stl_matrix(eigen_matrix.rows());
+
+  for (int k=0; k < eigen_matrix.outerSize(); ++k)
+    for (typename Eigen::SparseMatrix<NumericT, flags>::InnerIterator it(eigen_matrix, k); it; ++it)
+      stl_matrix[it.row()][it.col()] = it.value();
+
+  copy(tools::const_sparse_matrix_adapter<NumericT>(stl_matrix, eigen_matrix.rows(), eigen_matrix.cols()), gpu_matrix);
+}
+#endif
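
A hedged usage sketch of the Eigen overload above; it assumes Eigen is installed and that VIENNACL_WITH_EIGEN is defined (usually as a compiler flag) before the ViennaCL headers are included, and the matrix values are arbitrary:

#define VIENNACL_WITH_EIGEN
#include <Eigen/Sparse>
#include "viennacl/compressed_matrix.hpp"

int main()
{
  Eigen::SparseMatrix<double> host_A(4, 4);
  host_A.insert(0, 1) =  1.0;
  host_A.insert(2, 3) = -2.5;
  host_A.makeCompressed();

  viennacl::compressed_matrix<double> device_A;
  viennacl::copy(host_A, device_A);   // overload guarded by VIENNACL_WITH_EIGEN above
  return 0;
}
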
+
+
+#ifdef VIENNACL_WITH_MTL4
+/** @brief Convenience routine for copying a sparse MTL4 matrix to a ViennaCL matrix.
+  *
+  * Builds a temporary STL matrix for the copy. Patches for avoiding the temporary matrix welcome.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const mtl::compressed2D<NumericT> & cpu_matrix,
+          compressed_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+  assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  typedef mtl::compressed2D<NumericT>  MatrixType;
+
+  std::vector< std::map<unsigned int, NumericT> >  stl_matrix(cpu_matrix.num_rows());
+
+  using mtl::traits::range_generator;
+  using mtl::traits::range::min;
+
+  // Choose between row and column traversal
+  typedef typename min<range_generator<mtl::tag::row, MatrixType>,
+      range_generator<mtl::tag::col, MatrixType> >::type   range_type;
+  range_type                                                      my_range;
+
+  // Type of outer cursor
+  typedef typename range_type::type                               c_type;
+  // Type of inner cursor
+  typedef typename mtl::traits::range_generator<mtl::tag::nz, c_type>::type ic_type;
+
+  // Define the property maps
+  typename mtl::traits::row<MatrixType>::type                              row(cpu_matrix);
+  typename mtl::traits::col<MatrixType>::type                              col(cpu_matrix);
+  typename mtl::traits::const_value<MatrixType>::type                      value(cpu_matrix);
+
+  // Now iterate over the matrix
+  for (c_type cursor(my_range.begin(cpu_matrix)), cend(my_range.end(cpu_matrix)); cursor != cend; ++cursor)
+    for (ic_type icursor(mtl::begin<mtl::tag::nz>(cursor)), icend(mtl::end<mtl::tag::nz>(cursor)); icursor != icend; ++icursor)
+      stl_matrix[row(*icursor)][col(*icursor)] = value(*icursor);
+
+  copy(tools::const_sparse_matrix_adapter<NumericT>(stl_matrix, cpu_matrix.num_rows(), cpu_matrix.num_cols()), gpu_matrix);
+}
+#endif
+
+
+
+
+
+
+
+//
+// device to host:
+//
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+  *
+  * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+  * - operator(i,j)       Write new entries via the parenthesis operator
+  *
+  * @param gpu_matrix   A compressed_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+          CPUMatrixT & cpu_matrix )
+{
+  assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+  {
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+    std::vector<NumericT> elements(gpu_matrix.nnz());
+
+    //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+    viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+    //fill the cpu_matrix:
+    vcl_size_t data_index = 0;
+    for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+    {
+      while (data_index < row_buffer[row])
+      {
+        if (col_buffer[data_index] >= gpu_matrix.size2())
+        {
+          std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+          return;
+        }
+
+        if (std::fabs(elements[data_index]) > static_cast<NumericT>(0))
+          cpu_matrix(row-1, static_cast<vcl_size_t>(col_buffer[data_index])) = elements[data_index];
+        ++data_index;
+      }
+    }
+  }
+}
+
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format.
+  *
+  * @param gpu_matrix   A compressed_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+          std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+  assert( (cpu_matrix.size() == gpu_matrix.size1()) && bool("Size mismatch") );
+
+  tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+  copy(gpu_matrix, temp);
+}
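
A minimal round-trip sketch for compressed_matrix using the std::vector< std::map<> > overloads above (again assuming the default ViennaCL context/backend is usable):

#include <cstdio>
#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"

int main()
{
  std::vector< std::map<unsigned int, double> > host_A(3), host_B(3);
  host_A[0][2] = 1.5;
  host_A[2][0] = -4.0;

  viennacl::compressed_matrix<double> device_A;
  viennacl::copy(host_A, device_A);   // host -> device
  viennacl::copy(device_A, host_B);   // device -> host (the overload above)

  std::printf("%f %f\n", host_B[0][2], host_B[2][0]);   // 1.5 -4.0
  return 0;
}
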
+
+#ifdef VIENNACL_WITH_UBLAS
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse uBLAS matrix
+  *
+  * Directly populates the internal buffer of the uBLAS matrix, thus avoiding a temporary STL matrix.
+  */
+template<typename ScalarType, unsigned int AlignmentV, typename F, vcl_size_t IB, typename IA, typename TA>
+void copy(viennacl::compressed_matrix<ScalarType, AlignmentV> const & gpu_matrix,
+          boost::numeric::ublas::compressed_matrix<ScalarType, F, IB, IA, TA> & ublas_matrix)
+{
+  assert( (viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+  viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+
+  viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+  viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+
+  ublas_matrix.clear();
+  ublas_matrix.reserve(gpu_matrix.nnz());
+
+  ublas_matrix.set_filled(gpu_matrix.size1() + 1, gpu_matrix.nnz());
+
+  for (vcl_size_t i=0; i<ublas_matrix.size1() + 1; ++i)
+    ublas_matrix.index1_data()[i] = row_buffer[i];
+
+  for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+    ublas_matrix.index2_data()[i] = col_buffer[i];
+
+  viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(ScalarType) * gpu_matrix.nnz(), &(ublas_matrix.value_data()[0]));
+
+}
+#endif
+
+#ifdef VIENNACL_WITH_ARMADILLO
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse Armadillo matrix.
+ *
+ * Performance notice: Inserting the row-major data from the ViennaCL matrix to the column-major Armadillo-matrix is likely to be slow.
+ * However, since this operation is unlikely to be performance-critical, further optimizations are postponed.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void copy(viennacl::compressed_matrix<NumericT, AlignmentV> & vcl_matrix,
+          arma::SpMat<NumericT> & arma_matrix)
+{
+  assert( (static_cast<vcl_size_t>(arma_matrix.n_rows) == vcl_matrix.size1()) && bool("Size mismatch") );
+  assert( (static_cast<vcl_size_t>(arma_matrix.n_cols) == vcl_matrix.size2()) && bool("Size mismatch") );
+
+  if ( vcl_matrix.size1() > 0 && vcl_matrix.size2() > 0 )
+  {
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(vcl_matrix.handle1(), vcl_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(vcl_matrix.handle2(), vcl_matrix.nnz());
+    viennacl::backend::typesafe_host_array<NumericT>     elements  (vcl_matrix.handle(),  vcl_matrix.nnz());
+
+    viennacl::backend::memory_read(vcl_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+    viennacl::backend::memory_read(vcl_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+    viennacl::backend::memory_read(vcl_matrix.handle(),  0, elements.raw_size(),     elements.get());
+
+    arma_matrix.zeros();
+    vcl_size_t data_index = 0;
+    for (vcl_size_t row = 1; row <= vcl_matrix.size1(); ++row)
+    {
+      while (data_index < row_buffer[row])
+      {
+        assert(col_buffer[data_index] < vcl_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+        if (elements[data_index] != static_cast<NumericT>(0.0))
+          arma_matrix(row-1, col_buffer[data_index]) = elements[data_index];
+        ++data_index;
+      }
+    }
+  }
+}
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse Eigen matrix */
+template<typename NumericT, int flags, unsigned int AlignmentV>
+void copy(compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+          Eigen::SparseMatrix<NumericT, flags> & eigen_matrix)
+{
+  assert( (static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+  {
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+    std::vector<NumericT> elements(gpu_matrix.nnz());
+
+    viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(NumericT)* gpu_matrix.nnz(),        &(elements[0]));
+
+    eigen_matrix.setZero();
+    vcl_size_t data_index = 0;
+    for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+    {
+      while (data_index < row_buffer[row])
+      {
+        assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+        if (elements[data_index] != static_cast<NumericT>(0.0))
+          eigen_matrix.insert(row-1, col_buffer[data_index]) = elements[data_index];
+        ++data_index;
+      }
+    }
+  }
+}
+#endif
+
+
+
+#ifdef VIENNACL_WITH_MTL4
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse MTL4 matrix */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+          mtl::compressed2D<NumericT> & mtl4_matrix)
+{
+  assert( (static_cast<vcl_size_t>(mtl4_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (static_cast<vcl_size_t>(mtl4_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+  {
+
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+    std::vector<NumericT> elements(gpu_matrix.nnz());
+
+    viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+    //set_to_zero(mtl4_matrix);
+    //mtl4_matrix.change_dim(gpu_matrix.size1(), gpu_matrix.size2());
+
+    mtl::matrix::inserter< mtl::compressed2D<NumericT> >  ins(mtl4_matrix);
+    vcl_size_t data_index = 0;
+    for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+    {
+      while (data_index < row_buffer[row])
+      {
+        assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+        if (elements[data_index] != static_cast<NumericT>(0.0))
+          ins(row-1, col_buffer[data_index]) << typename mtl::Collection< mtl::compressed2D<NumericT> >::value_type(elements[data_index]);
+        ++data_index;
+      }
+    }
+  }
+}
+#endif
+
+
+
+
+
+//////////////////////// compressed_matrix //////////////////////////
+/** @brief A sparse square matrix in compressed sparse rows format.
+  *
+  * @tparam NumericT    The floating point type (either float or double, checked at compile time)
+  * @tparam AlignmentV     The internal memory size for the entries in each row is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+  */
+template<class NumericT, unsigned int AlignmentV /* see VCLForwards.h */>
+class compressed_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+  typedef vcl_size_t                                                                                 size_type;
+
+  /** @brief Default construction of a compressed matrix. No memory is allocated */
+  compressed_matrix() : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0) {}
+
+  /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param nonzeros Optional number of nonzeros for memory preallocation
+      * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+      */
+  explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+    : rows_(rows), cols_(cols), nonzeros_(nonzeros), row_block_num_(0)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+    row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+      row_blocks_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+    if (rows > 0)
+    {
+      viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+      viennacl::vector_base<unsigned int> init_temporary(row_buffer_, size_type(rows+1), 0, 1);
+      init_temporary = viennacl::zero_vector<unsigned int>(size_type(rows+1), ctx);
+    }
+    if (nonzeros > 0)
+    {
+      viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+      viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, ctx);
+    }
+  }
+
+  /** @brief Construction of a compressed matrix with the supplied number of rows and columns in the given context. Memory for nonzeros is not allocated yet
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param ctx      Context in which to create the matrix
+      */
+  explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+    : rows_(rows), cols_(cols), nonzeros_(0), row_block_num_(0)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+    row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+      row_blocks_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+    if (rows > 0)
+    {
+      viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+      viennacl::vector_base<unsigned int> init_temporary(row_buffer_, size_type(rows+1), 0, 1);
+      init_temporary = viennacl::zero_vector<unsigned int>(size_type(rows+1), ctx);
+    }
+  }
+
+  /** @brief Creates an empty compressed_matrix, but sets the respective context information.
+    *
+    * This is useful if you want to populate e.g. a viennacl::compressed_matrix<> on the host with copy(), while the default backend is OpenCL.
+    */
+  explicit compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0)
+  {
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+    row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+      row_blocks_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  /** @brief Wraps existing OpenCL buffers holding the compressed sparse row information.
+    *
+    * @param mem_row_buffer   A buffer consisting of unsigned integers (cl_uint) holding the entry points for each row (0-based indexing). (rows+1) elements, the last element being 'nonzeros'.
+    * @param mem_col_buffer   A buffer consisting of unsigned integers (cl_uint) holding the column index for each nonzero entry as stored in 'mem_elements'.
+    * @param mem_elements     A buffer holding the floating point numbers for nonzeros. OpenCL type of elements must match the template 'NumericT'.
+    * @param rows             Number of rows in the matrix to be wrapped.
+    * @param cols             Number of columns to be wrapped.
+    * @param nonzeros         Number of nonzero entries in the matrix.
+    */
+  explicit compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_col_buffer, cl_mem mem_elements,
+                             vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros) :
+    rows_(rows), cols_(cols), nonzeros_(nonzeros), row_block_num_(0)
+  {
+    row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    row_buffer_.opencl_handle() = mem_row_buffer;
+    row_buffer_.opencl_handle().inc();             //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+    row_buffer_.raw_size(sizeof(cl_uint) * (rows + 1));
+
+    col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    col_buffer_.opencl_handle() = mem_col_buffer;
+    col_buffer_.opencl_handle().inc();             //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+    col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+    elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+    elements_.opencl_handle() = mem_elements;
+    elements_.opencl_handle().inc();               //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+    elements_.raw_size(sizeof(NumericT) * nonzeros);
+
+    //generate block information for CSR-adaptive:
+    generate_row_block_information();
+  }
+#endif
+
+  /** @brief Creates a compressed matrix from the product of two compressed_matrix objects (C = A * B). */
+  compressed_matrix(matrix_expression<const compressed_matrix, const compressed_matrix, op_prod> const & proxy)
+    : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0)
+  {
+    viennacl::context ctx = viennacl::traits::context(proxy.lhs());
+
+    row_buffer_.switch_active_handle_id(ctx.memory_type());
+    col_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+    row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      row_buffer_.opencl_handle().context(ctx.opencl_context());
+      col_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+      row_blocks_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+    generate_row_block_information();
+  }
+
+  /** @brief Assigns a compressed matrix, possibly from another memory domain. */
+  compressed_matrix & operator=(compressed_matrix const & other)
+  {
+    assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+    assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+    rows_ = other.size1();
+    cols_ = other.size2();
+    nonzeros_ = other.nnz();
+    row_block_num_ = other.row_block_num_;
+
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_, row_buffer_);
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_, col_buffer_);
+    viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_blocks_, row_blocks_);
+    viennacl::backend::typesafe_memory_copy<NumericT>(other.elements_, elements_);
+
+    return *this;
+  }
+
+  /** @brief Assigns the product of two compressed_matrix objects (C = A * B) to this matrix. */
+  compressed_matrix & operator=(matrix_expression<const compressed_matrix, const compressed_matrix, op_prod> const & proxy)
+  {
+    assert( (rows_ == 0 || rows_ == proxy.lhs().size1()) && bool("Size mismatch") );
+    assert( (cols_ == 0 || cols_ == proxy.rhs().size2()) && bool("Size mismatch") );
+
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+    generate_row_block_information();
+
+    return *this;
+  }
+
+
+  /** @brief Sets the row, column and value arrays of the compressed matrix
+    *
+    * Type of row_jumper and col_buffer is 'unsigned int' for CUDA and OpenMP (host) backend, but *must* be cl_uint for OpenCL.
+    * The reason is that 'unsigned int' might have a different bit representation on the host than 'unsigned int' on the OpenCL device.
+    * cl_uint is guaranteed to have the correct bit representation for OpenCL devices.
+    *
+    * @param row_jumper     Pointer to an array holding the indices of the first element of each row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th row. The array length is 'rows + 1'
+    * @param col_buffer     Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+    * @param elements       Pointer to an array holding the entries of the sparse matrix. The array length is 'nonzeros'
+    * @param rows           Number of rows of the sparse matrix
+    * @param cols           Number of columns of the sparse matrix
+    * @param nonzeros       Number of nonzeros
+    */
+  void set(const void * row_jumper,
+           const void * col_buffer,
+           const NumericT * elements,
+           vcl_size_t rows,
+           vcl_size_t cols,
+           vcl_size_t nonzeros)
+  {
+    assert( (rows > 0)     && bool("Error in compressed_matrix::set(): Number of rows must be larger than zero!"));
+    assert( (cols > 0)     && bool("Error in compressed_matrix::set(): Number of columns must be larger than zero!"));
+    assert( (nonzeros > 0) && bool("Error in compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+    //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+    //row_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+    viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (rows + 1), viennacl::traits::context(row_buffer_), row_jumper);
+
+    //col_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+    viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros, viennacl::traits::context(col_buffer_), col_buffer);
+
+    //elements_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+    viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, viennacl::traits::context(elements_), elements);
+
+    nonzeros_ = nonzeros;
+    rows_ = rows;
+    cols_ = cols;
+
+    //generate block information for CSR-adaptive:
+    generate_row_block_information();
+  }
+
+  /** @brief Allocates memory for the supplied number of nonzeros in the matrix. Old values are preserved if 'preserve' is true (the default). */
+  void reserve(vcl_size_t new_nonzeros, bool preserve = true)
+  {
+    if (new_nonzeros > nonzeros_)
+    {
+      if (preserve)
+      {
+        handle_type col_buffer_old;
+        handle_type elements_old;
+        viennacl::backend::memory_shallow_copy(col_buffer_, col_buffer_old);
+        viennacl::backend::memory_shallow_copy(elements_,   elements_old);
+
+        viennacl::backend::typesafe_host_array<unsigned int> size_deducer(col_buffer_);
+        viennacl::backend::memory_create(col_buffer_, size_deducer.element_size() * new_nonzeros, viennacl::traits::context(col_buffer_));
+        viennacl::backend::memory_create(elements_,   sizeof(NumericT) * new_nonzeros,          viennacl::traits::context(elements_));
+
+        viennacl::backend::memory_copy(col_buffer_old, col_buffer_, 0, 0, size_deducer.element_size() * nonzeros_);
+        viennacl::backend::memory_copy(elements_old,   elements_,   0, 0, sizeof(NumericT)* nonzeros_);
+      }
+      else
+      {
+        viennacl::backend::typesafe_host_array<unsigned int> size_deducer(col_buffer_);
+        viennacl::backend::memory_create(col_buffer_, size_deducer.element_size() * new_nonzeros, viennacl::traits::context(col_buffer_));
+        viennacl::backend::memory_create(elements_,   sizeof(NumericT)            * new_nonzeros, viennacl::traits::context(elements_));
+      }
+
+      nonzeros_ = new_nonzeros;
+    }
+  }
+
+  /** @brief Resize the matrix.
+      *
+      * @param new_size1    New number of rows
+      * @param new_size2    New number of columns
+      * @param preserve     If true, the old values are preserved (via a temporary host copy); if false, all entries are discarded.
+      */
+  void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+  {
+    assert(new_size1 > 0 && new_size2 > 0 && bool("Cannot resize to zero size!"));
+
+    if (new_size1 != rows_ || new_size2 != cols_)
+    {
+      if (!preserve)
+      {
+        viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, new_size1 + 1);
+        viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (new_size1 + 1), viennacl::traits::context(row_buffer_), host_row_buffer.get());
+        // faster version without initializing memory:
+        //viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (new_size1 + 1), viennacl::traits::context(row_buffer_));
+        nonzeros_ = 0;
+      }
+      else
+      {
+        std::vector<std::map<unsigned int, NumericT> > stl_sparse_matrix;
+        if (rows_ > 0)
+        {
+          stl_sparse_matrix.resize(rows_);
+          viennacl::copy(*this, stl_sparse_matrix);
+        } else {
+          stl_sparse_matrix.resize(new_size1);
+          stl_sparse_matrix[0][0] = 0;      //enforces nonzero array sizes if matrix was initially empty
+        }
+
+        stl_sparse_matrix.resize(new_size1);
+
+        //discard entries with column index larger than new_size2
+        if (new_size2 < cols_ && rows_ > 0)
+        {
+          for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+          {
+            std::list<unsigned int> to_delete;
+            for (typename std::map<unsigned int, NumericT>::iterator it = stl_sparse_matrix[i].begin();
+                 it != stl_sparse_matrix[i].end();
+                 ++it)
+            {
+              if (it->first >= new_size2)
+                to_delete.push_back(it->first);
+            }
+
+            for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+              stl_sparse_matrix[i].erase(*it);
+          }
+        }
+
+        viennacl::tools::sparse_matrix_adapter<NumericT> adapted_matrix(stl_sparse_matrix, new_size1, new_size2);
+        rows_ = new_size1;
+        cols_ = new_size2;
+        viennacl::copy(adapted_matrix, *this);
+      }
+
+      rows_ = new_size1;
+      cols_ = new_size2;
+    }
+  }
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+  void clear()
+  {
+    viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, rows_ + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(col_buffer_, 1);
+    std::vector<NumericT> host_elements(1);
+
+    viennacl::backend::memory_create(row_buffer_, host_row_buffer.element_size() * (rows_ + 1), viennacl::traits::context(row_buffer_), host_row_buffer.get());
+    viennacl::backend::memory_create(col_buffer_, host_col_buffer.element_size() * 1,           viennacl::traits::context(col_buffer_), host_col_buffer.get());
+    viennacl::backend::memory_create(elements_,   sizeof(NumericT) * 1,                         viennacl::traits::context(elements_), &(host_elements[0]));
+
+    nonzeros_ = 0;
+  }
+
+  /** @brief Returns a reference to the (i,j)-th entry of the sparse matrix. If (i,j) does not exist (zero), it is inserted (slow!) */
+  entry_proxy<NumericT> operator()(vcl_size_t i, vcl_size_t j)
+  {
+    assert( (i < rows_) && (j < cols_) && bool("compressed_matrix access out of bounds!"));
+
+    vcl_size_t index = element_index(i, j);
+
+    // check for element in sparsity pattern
+    if (index < nonzeros_)
+      return entry_proxy<NumericT>(index, elements_);
+
+    // Element not found. Copying required. Very slow, but direct entry manipulation is painful anyway...
+    std::vector< std::map<unsigned int, NumericT> > cpu_backup(rows_);
+    tools::sparse_matrix_adapter<NumericT> adapted_cpu_backup(cpu_backup, rows_, cols_);
+    viennacl::copy(*this, adapted_cpu_backup);
+    cpu_backup[i][static_cast<unsigned int>(j)] = 0.0;
+    viennacl::copy(adapted_cpu_backup, *this);
+
+    index = element_index(i, j);
+
+    assert(index < nonzeros_);
+
+    return entry_proxy<NumericT>(index, elements_);
+  }
+
+  /** @brief  Returns the number of rows */
+  const vcl_size_t & size1() const { return rows_; }
+  /** @brief  Returns the number of columns */
+  const vcl_size_t & size2() const { return cols_; }
+  /** @brief  Returns the number of nonzero entries */
+  const vcl_size_t & nnz() const { return nonzeros_; }
+  /** @brief  Returns the internal number of row blocks for an adaptive SpMV */
+  const vcl_size_t & blocks1() const { return row_block_num_; }
+
+  /** @brief  Returns the memory handle holding the row index array */
+  const handle_type & handle1() const { return row_buffer_; }
+  /** @brief  Returns the memory handle holding the column index array */
+  const handle_type & handle2() const { return col_buffer_; }
+  /** @brief  Returns the memory handle holding the row block array */
+  const handle_type & handle3() const { return row_blocks_; }
+  /** @brief  Returns the memory handle holding the matrix entry array */
+  const handle_type & handle() const { return elements_; }
+
+  /** @brief  Returns the memory handle holding the row index array */
+  handle_type & handle1() { return row_buffer_; }
+  /** @brief  Returns the memory handle holding the column index array */
+  handle_type & handle2() { return col_buffer_; }
+  /** @brief  Returns the memory handle holding the row block array */
+  handle_type & handle3() { return row_blocks_; }
+  /** @brief  Returns the memory handle holding the matrix entry array */
+  handle_type & handle() { return elements_; }
+
+  /** @brief Switches the memory context of the matrix.
+    *
+    * Allows e.g. a migration of the full matrix from OpenCL memory to host memory for computing a preconditioner.
+    */
+  void switch_memory_context(viennacl::context new_ctx)
+  {
+    viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+    viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+    viennacl::backend::switch_memory_context<unsigned int>(row_blocks_, new_ctx);
+    viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx);
+  }
+
+  /** @brief Returns the current memory context to determine whether the matrix is set up for OpenMP, OpenCL, or CUDA. */
+  viennacl::memory_types memory_context() const
+  {
+    return row_buffer_.get_active_handle_id();
+  }
+
+private:
+
+  /** @brief Helper function for accessing the element (i,j) of the matrix. */
+  vcl_size_t element_index(vcl_size_t i, vcl_size_t j)
+  {
+    //read row indices
+    viennacl::backend::typesafe_host_array<unsigned int> row_indices(row_buffer_, 2);
+    viennacl::backend::memory_read(row_buffer_, row_indices.element_size()*i, row_indices.element_size()*2, row_indices.get());
+
+    //get column indices for row i:
+    viennacl::backend::typesafe_host_array<unsigned int> col_indices(col_buffer_, row_indices[1] - row_indices[0]);
+    viennacl::backend::memory_read(col_buffer_, col_indices.element_size()*row_indices[0], col_indices.element_size()*col_indices.size(), col_indices.get());
+
+    for (vcl_size_t k=0; k<col_indices.size(); ++k)
+    {
+      if (col_indices[k] == j)
+        return row_indices[0] + k;
+    }
+
+    // if not found, return index past the end of the matrix (cf. matrix.end() in the spirit of the STL)
+    return nonzeros_;
+  }
+
+public:
+  /** @brief Builds the row block information needed for fast sparse matrix-vector multiplications.
+   *
+   *  Required when manually populating the memory buffers with values; not necessary when using viennacl::copy() or set().
+   */
+  void generate_row_block_information()
+  {
+    viennacl::backend::typesafe_host_array<unsigned int> row_buffer(row_buffer_, rows_ + 1);
+    viennacl::backend::memory_read(row_buffer_, 0, row_buffer.raw_size(), row_buffer.get());
+
+    viennacl::backend::typesafe_host_array<unsigned int> row_blocks(row_buffer_, rows_ + 1);
+
+    vcl_size_t num_entries_in_current_batch = 0;
+
+    const vcl_size_t shared_mem_size = 1024; // number of column indices (and the same number of floating point values) that fit into shared memory per block
+
+    row_block_num_ = 0;
+    row_blocks.set(0, 0);
+    for (vcl_size_t i=0; i<rows_; ++i)
+    {
+      vcl_size_t entries_in_row = vcl_size_t(row_buffer[i+1]) - vcl_size_t(row_buffer[i]);
+      num_entries_in_current_batch += entries_in_row;
+
+      if (num_entries_in_current_batch > shared_mem_size)
+      {
+        vcl_size_t rows_in_batch = i - row_blocks[row_block_num_];
+        if (rows_in_batch > 0) // at least one full row is in the batch. Use current row in next batch.
+          row_blocks.set(++row_block_num_, i--);
+        else // row is larger than buffer in shared memory
+          row_blocks.set(++row_block_num_, i+1);
+        num_entries_in_current_batch = 0;
+      }
+    }
+    if (num_entries_in_current_batch > 0)
+      row_blocks.set(++row_block_num_, rows_);
+
+    if (row_block_num_ > 0) //matrix might be empty...
+      viennacl::backend::memory_create(row_blocks_,
+                                       row_blocks.element_size() * (row_block_num_ + 1),
+                                       viennacl::traits::context(row_buffer_), row_blocks.get());
+
+  }
+
+private:
+  // /** @brief Copy constructor is by now not available. */
+  //compressed_matrix(compressed_matrix const &);
+
+private:
+
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t nonzeros_;
+  vcl_size_t row_block_num_;
+  handle_type row_buffer_;
+  handle_type row_blocks_;
+  handle_type col_buffer_;
+  handle_type elements_;
+};
+
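
The row-blocking scheme used by generate_row_block_information() above can be summarized with a small host-side sketch (illustrative only; the names are not part of ViennaCL): consecutive rows are grouped into a block until roughly shared_mem_size nonzeros are collected, and a single row exceeding that limit forms a block of its own.

#include <cstddef>
#include <vector>

// Partition a CSR row-pointer array into row blocks of at most max_nnz nonzeros each.
std::vector<std::size_t> compute_row_blocks(std::vector<unsigned int> const & row_ptr,
                                            std::size_t max_nnz = 1024)
{
  std::vector<std::size_t> blocks(1, 0);          // first block starts at row 0
  std::size_t nnz_in_block = 0;
  for (std::size_t i = 0; i + 1 < row_ptr.size(); ++i)
  {
    nnz_in_block += row_ptr[i + 1] - row_ptr[i];
    if (nnz_in_block > max_nnz)
    {
      if (blocks.back() < i)   // block already holds complete rows: close it before row i
      {
        blocks.push_back(i);
        --i;                   // revisit row i in the next block
      }
      else                     // row i alone exceeds the limit: it becomes its own block
        blocks.push_back(i + 1);
      nnz_in_block = 0;
    }
  }
  if (nnz_in_block > 0)
    blocks.push_back(row_ptr.size() - 1);         // close the final block at the last row
  return blocks;
}
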
+/** @brief Output stream support for compressed_matrix. The output format is the same as for MATLAB, Octave, or SciPy
+  *
+  * @param os   STL output stream
+  * @param A    The compressed matrix to be printed.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream & os, compressed_matrix<NumericT, AlignmentV> const & A)
+{
+  std::vector<std::map<unsigned int, NumericT> > tmp(A.size1());
+  viennacl::copy(A, tmp);
+  os << "compressed_matrix of size (" << A.size1() << ", " << A.size2() << ") with " << A.nnz() << " nonzeros:" << std::endl;
+
+  for (vcl_size_t i=0; i<A.size1(); ++i)
+  {
+    for (typename std::map<unsigned int, NumericT>::const_iterator it = tmp[i].begin(); it != tmp[i].end(); ++it)
+      os << "  (" << i << ", " << it->first << ")\t" << it->second << std::endl;
+  }
+  return os;
+}
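
A short usage sketch for the class and operations above (not part of this commit; the matrix values are made up for illustration). Note that the op_executor specializations further below also make the aliasing case y = prod(A, y) safe by introducing a temporary.

#include <iostream>
#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

int main()
{
  // Host representation: one std::map per row, as used by the copy() routines above.
  std::vector<std::map<unsigned int, double> > host_A(3);
  host_A[0][0] = 1.0; host_A[0][2] = 2.0;
  host_A[1][1] = 3.0;
  host_A[2][0] = 4.0; host_A[2][2] = 5.0;

  viennacl::compressed_matrix<double> A(3, 3);
  viennacl::copy(host_A, A);                       // host -> device

  viennacl::vector<double> x(3), y(3);
  x = viennacl::scalar_vector<double>(3, 1.0);     // x = (1, 1, 1)
  y = viennacl::linalg::prod(A, x);                // sparse matrix-vector product

  std::cout << A << std::endl;                     // stream operator defined above
  return 0;
}
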
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x += A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x -= A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x += A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x -= A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+  /** \endcond */
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/context.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/context.hpp b/native-viennaCL/src/main/cpp/viennacl/context.hpp
new file mode 100644
index 0000000..ed00c39
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/context.hpp
@@ -0,0 +1,88 @@
+#ifndef VIENNACL_CONTEXT_HPP_
+#define VIENNACL_CONTEXT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/context.hpp
+    @brief Implementation of an OpenCL-like context, which serves as a unification of {OpenMP, CUDA, OpenCL} at the user API.
+*/
+
+#include <vector>
+#include <stddef.h>
+#include <assert.h>
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+/** @brief Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also suitable for CUDA and OpenMP
+  *
+  * Context objects are used to distinguish between different memory domains. One context may refer to an OpenCL device, another context may refer to a CUDA device, and a third context to main RAM.
+  * Thus, operations are only defined on objects residing on the same context.
+  */
+class context
+{
+public:
+  context() : mem_type_(viennacl::backend::default_memory_type())
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    if (mem_type_ == OPENCL_MEMORY)
+      ocl_context_ptr_ = &viennacl::ocl::current_context();
+    else
+      ocl_context_ptr_ = NULL;
+#endif
+  }
+
+  explicit context(viennacl::memory_types mtype) : mem_type_(mtype)
+  {
+    if (mem_type_ == MEMORY_NOT_INITIALIZED)
+      mem_type_ = viennacl::backend::default_memory_type();
+#ifdef VIENNACL_WITH_OPENCL
+    if (mem_type_ == OPENCL_MEMORY)
+      ocl_context_ptr_ = &viennacl::ocl::current_context();
+    else
+      ocl_context_ptr_ = NULL;
+#endif
+  }
+
+#ifdef VIENNACL_WITH_OPENCL
+  context(viennacl::ocl::context const & ctx) : mem_type_(OPENCL_MEMORY), ocl_context_ptr_(&ctx) {}
+
+  viennacl::ocl::context const & opencl_context() const
+  {
+    assert(mem_type_ == OPENCL_MEMORY && bool("Context type is not OpenCL"));
+    return *ocl_context_ptr_;
+  }
+#endif
+
+  // TODO: Add CUDA and OpenMP contexts
+
+  viennacl::memory_types  memory_type() const { return mem_type_; }
+
+private:
+  viennacl::memory_types   mem_type_;
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::context const * ocl_context_ptr_;
+#endif
+};
+
+
+}
+
+#endif
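
A small sketch (not part of the commit) of how such a context can be used to pin objects to a particular memory domain, using only the constructors shown above and in compressed_matrix:

#include "viennacl/context.hpp"
#include "viennacl/compressed_matrix.hpp"

int main()
{
  // A context referring to main memory (host backend), regardless of the default backend.
  viennacl::context host_ctx(viennacl::MAIN_MEMORY);

  // A sparse matrix created directly in that context ...
  viennacl::compressed_matrix<double> A(1000, 1000, host_ctx);

  // ... which can later be migrated, e.g. back to the default memory domain.
  A.switch_memory_context(viennacl::context());
  return 0;
}
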


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
new file mode 100644
index 0000000..0b81203
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
@@ -0,0 +1,398 @@
+#ifndef VIENNACL_LINALG_AMG_HPP_
+#define VIENNACL_LINALG_AMG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg.hpp
+    @brief Main include file for algebraic multigrid (AMG) preconditioners.  Experimental.
+
+    Implementation contributed by Markus Wagner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/amg_operations.hpp"
+#include "viennacl/tools/timer.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+#include <map>
+
+#ifdef VIENNACL_WITH_OPENMP
+ #include <omp.h>
+#endif
+
+#define VIENNACL_AMG_MAX_LEVELS 20
+
+namespace viennacl
+{
+namespace linalg
+{
+
+class amg_coarse_problem_too_large_exception : public std::runtime_error
+{
+public:
+  amg_coarse_problem_too_large_exception(std::string const & msg, vcl_size_t num_points) : std::runtime_error(msg), c_points_(num_points) {}
+
+  /** @brief Returns the number of coarse points for which no further coarsening could be applied */
+  vcl_size_t coarse_points() const { return c_points_; }
+
+private:
+  vcl_size_t c_points_;
+};
+
+
+namespace detail
+{
+  /** @brief Sparse Galerkin product: Calculates A_coarse = trans(P)*A_fine*P = R*A_fine*P
+    *
+    * @param A_fine    Operator matrix on fine grid (quadratic)
+    * @param P         Prolongation/Interpolation matrix
+    * @param R         Restriction matrix
+    * @param A_coarse  Result matrix on coarse grid (Galerkin operator)
+    */
+  template<typename NumericT>
+  void amg_galerkin_prod(compressed_matrix<NumericT> & A_fine,
+                         compressed_matrix<NumericT> & P,
+                         compressed_matrix<NumericT> & R, //P^T
+                         compressed_matrix<NumericT> & A_coarse)
+  {
+
+    compressed_matrix<NumericT> A_fine_times_P(viennacl::traits::context(A_fine));
+
+    // transpose P in memory (no known way of efficiently multiplying P^T * B for CSR-matrices P and B):
+    viennacl::linalg::detail::amg::amg_transpose(P, R);
+
+    // compute Galerkin product using a temporary for the result of A_fine * P
+    A_fine_times_P = viennacl::linalg::prod(A_fine, P);
+    A_coarse = viennacl::linalg::prod(R, A_fine_times_P);
+
+  }
+
+
+  /** @brief Setup AMG preconditioner
+  *
+  * @param list_of_A                  Operator matrices on all levels
+  * @param list_of_P                  Prolongation/Interpolation operators on all levels
+  * @param list_of_R                  Restriction operators on all levels
+  * @param list_of_amg_level_context  Auxiliary data structures for managing the grid hierarchy (coarse nodes, etc.)
+  * @param tag                        AMG preconditioner tag
+  */
+  template<typename NumericT, typename AMGContextListT>
+  vcl_size_t amg_setup(std::vector<compressed_matrix<NumericT> > & list_of_A,
+                       std::vector<compressed_matrix<NumericT> > & list_of_P,
+                       std::vector<compressed_matrix<NumericT> > & list_of_R,
+                       AMGContextListT & list_of_amg_level_context,
+                       amg_tag & tag)
+  {
+    // Set the number of coarsening iterations. If automatic coarse grid construction is chosen (0), use an upper bound and stop once the coarse problem is small enough.
+    vcl_size_t iterations = tag.get_coarse_levels();
+    if (iterations == 0)
+      iterations = VIENNACL_AMG_MAX_LEVELS;
+
+    for (vcl_size_t i=0; i<iterations; ++i)
+    {
+      list_of_amg_level_context[i].switch_context(tag.get_setup_context());
+      list_of_amg_level_context[i].resize(list_of_A[i].size1(), list_of_A[i].nnz());
+
+      // Construct C and F points on coarse level (i is fine level, i+1 coarse level).
+      detail::amg::amg_coarse(list_of_A[i], list_of_amg_level_context[i], tag);
+
+      // Calculate number of C and F points on level i.
+      unsigned int c_points = list_of_amg_level_context[i].num_coarse_;
+      unsigned int f_points = static_cast<unsigned int>(list_of_A[i].size1()) - c_points;
+
+      if (f_points == 0 && c_points > tag.get_coarsening_cutoff())
+      {
+        std::stringstream ss;
+        ss << "No further coarsening possible (" << c_points << " coarse points). Consider changing the strong connection threshold or increasing the coarsening cutoff." << std::endl;
+        throw amg_coarse_problem_too_large_exception(ss.str(), c_points);
+      }
+
+      // Stop routine when the maximal coarse level is found (no C or F point). Coarsest level is level i.
+      if (c_points == 0 || f_points == 0)
+        break;
+
+      // Construct interpolation matrix for level i.
+      detail::amg::amg_interpol(list_of_A[i], list_of_P[i], list_of_amg_level_context[i], tag);
+
+      // Compute coarse grid operator (A[i+1] = R * A[i] * P) with R = trans(P).
+      amg_galerkin_prod(list_of_A[i], list_of_P[i], list_of_R[i], list_of_A[i+1]);
+
+      // send matrices to target context:
+      list_of_A[i].switch_memory_context(tag.get_target_context());
+      list_of_P[i].switch_memory_context(tag.get_target_context());
+      list_of_R[i].switch_memory_context(tag.get_target_context());
+
+      // If the limit of coarse points is reached, stop. Coarsest level is level i+1.
+      if (tag.get_coarse_levels() == 0 && c_points <= tag.get_coarsening_cutoff())
+        return i+1;
+    }
+
+    return iterations;
+  }
+
+
+  /** @brief Initialize AMG preconditioner
+  *
+  * @param mat                        System matrix
+  * @param list_of_A                  Operator matrices on all levels
+  * @param list_of_P                  Prolongation/Interpolation operators on all levels
+  * @param list_of_R                  Restriction operators on all levels
+  * @param list_of_amg_level_context  Auxiliary data structures for managing the grid hierarchy (coarse nodes, etc.)
+  * @param tag                        AMG preconditioner tag
+  */
+  template<typename MatrixT, typename InternalT1, typename InternalT2>
+  void amg_init(MatrixT const & mat, InternalT1 & list_of_A, InternalT1 & list_of_P, InternalT1 & list_of_R, InternalT2 & list_of_amg_level_context, amg_tag & tag)
+  {
+    typedef typename InternalT1::value_type SparseMatrixType;
+
+    vcl_size_t num_levels = (tag.get_coarse_levels() > 0) ? tag.get_coarse_levels() : VIENNACL_AMG_MAX_LEVELS;
+
+    list_of_A.resize(num_levels+1, SparseMatrixType(tag.get_setup_context()));
+    list_of_P.resize(num_levels,   SparseMatrixType(tag.get_setup_context()));
+    list_of_R.resize(num_levels,   SparseMatrixType(tag.get_setup_context()));
+    list_of_amg_level_context.resize(num_levels);
+
+    // Insert operator matrix as operator for finest level.
+    //SparseMatrixType A0(mat);
+    //A.insert_element(0, A0);
+    list_of_A[0].switch_memory_context(viennacl::traits::context(mat));
+    list_of_A[0] = mat;
+    list_of_A[0].switch_memory_context(tag.get_setup_context());
+  }
+
+  /** @brief Setup data structures for precondition phase for later use on the GPU
+  *
+  * @param result          Result vector on all levels
+  * @param result_backup   Copy of result vector on all levels
+  * @param rhs             RHS vector on all levels
+  * @param residual        Residual vector on all levels
+  * @param A               Operator matrices on all levels from the setup phase
+  * @param coarse_levels   Number of coarse levels for which the data structures should be set up.
+  * @param tag             AMG preconditioner tag
+  */
+  template<typename InternalVectorT, typename SparseMatrixT>
+  void amg_setup_apply(InternalVectorT & result,
+                       InternalVectorT & result_backup,
+                       InternalVectorT & rhs,
+                       InternalVectorT & residual,
+                       SparseMatrixT const & A,
+                       vcl_size_t coarse_levels,
+                       amg_tag const & tag)
+  {
+    typedef typename InternalVectorT::value_type VectorType;
+
+    result.resize(coarse_levels + 1);
+    result_backup.resize(coarse_levels + 1);
+    rhs.resize(coarse_levels + 1);
+    residual.resize(coarse_levels);
+
+    for (vcl_size_t level=0; level <= coarse_levels; ++level)
+    {
+             result[level] = VectorType(A[level].size1(), tag.get_target_context());
+      result_backup[level] = VectorType(A[level].size1(), tag.get_target_context());
+                rhs[level] = VectorType(A[level].size1(), tag.get_target_context());
+    }
+    for (vcl_size_t level=0; level < coarse_levels; ++level)
+    {
+      residual[level] = VectorType(A[level].size1(), tag.get_target_context());
+    }
+  }
+
+
+  /** @brief Pre-computes the LU factorization of the coarsest-level operator for the direct solve.
+  *
+  * Speeds up the preconditioner application, since the factorization is computed only once instead of once per solver iteration.
+  *
+  * @param op           Operator matrix for direct solve
+  * @param A            Operator matrix on coarsest level
+  * @param tag          AMG preconditioner tag
+  */
+  template<typename NumericT, typename SparseMatrixT>
+  void amg_lu(viennacl::matrix<NumericT> & op,
+              SparseMatrixT const & A,
+              amg_tag const & tag)
+  {
+    op.switch_memory_context(tag.get_setup_context());
+    op.resize(A.size1(), A.size2(), false);
+    viennacl::linalg::detail::amg::assign_to_dense(A, op);
+
+    viennacl::linalg::lu_factorize(op);
+    op.switch_memory_context(tag.get_target_context());
+  }
+
+}
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class amg_precond;
+
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class amg_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef viennacl::compressed_matrix<NumericT, AlignmentV> SparseMatrixType;
+  typedef viennacl::vector<NumericT>                        VectorType;
+  typedef detail::amg::amg_level_context                    AMGContextType;
+
+public:
+
+  amg_precond() {}
+
+  /** @brief The constructor. Builds data structures.
+  *
+  * @param mat  System matrix
+  * @param tag  The AMG tag
+  */
+  amg_precond(compressed_matrix<NumericT, AlignmentV> const & mat,
+              amg_tag const & tag)
+  {
+    tag_ = tag;
+
+    // Initialize data structures.
+    detail::amg_init(mat, A_list_, P_list_, R_list_, amg_context_list_, tag_);
+  }
+
+  /** @brief Runs the setup phase: builds the multigrid hierarchy and the data structures required for applying the preconditioner.
+  */
+  void setup()
+  {
+    // Start setup phase.
+    vcl_size_t num_coarse_levels = detail::amg_setup(A_list_, P_list_, R_list_, amg_context_list_, tag_);
+
+    // Setup precondition phase (Data structures).
+    detail::amg_setup_apply(result_list_, result_backup_list_, rhs_list_, residual_list_, A_list_, num_coarse_levels, tag_);
+
+    // LU factorization for direct solve.
+    detail::amg_lu(coarsest_op_, A_list_[num_coarse_levels], tag_);
+  }
+
+
+  /** @brief Applies the preconditioner (one multigrid cycle)
+  *
+  * @param vec       The vector to which the preconditioner is applied
+  */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    vcl_size_t level;
+
+    // Precondition operation (Yang, p.3).
+    rhs_list_[0] = vec;
+
+    // Part 1: Restrict down to coarsest level
+    for (level=0; level < residual_list_.size(); level++)
+    {
+      result_list_[level].clear();
+
+      // Apply Smoother presmooth_ times.
+      viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_presmooth_steps()),
+                                                   A_list_[level],
+                                                   result_list_[level],
+                                                   result_backup_list_[level],
+                                                   rhs_list_[level],
+                                                   static_cast<NumericT>(tag_.get_jacobi_weight()));
+
+      // Compute residual.
+      //residual[level] = rhs_[level] - viennacl::linalg::prod(A_[level], result_[level]);
+      residual_list_[level] = viennacl::linalg::prod(A_list_[level], result_list_[level]);
+      residual_list_[level] = rhs_list_[level] - residual_list_[level];
+
+      // Restrict to coarse level. Result is RHS of coarse level equation.
+      //residual_coarse[level] = viennacl::linalg::prod(R[level],residual[level]);
+      rhs_list_[level+1] = viennacl::linalg::prod(R_list_[level], residual_list_[level]);
+    }
+
+    // Part 2: On the coarsest level, solve the system directly via the precomputed LU factorization (on the CPU)
+    result_list_[level] = rhs_list_[level];
+    viennacl::linalg::lu_substitute(coarsest_op_, result_list_[level]);
+
+    // Part 3: Prolongation to finest level
+    for (int level2 = static_cast<int>(residual_list_.size()-1); level2 >= 0; level2--)
+    {
+      level = static_cast<vcl_size_t>(level2);
+
+      // Interpolate error to fine level and correct solution.
+      result_backup_list_[level] = viennacl::linalg::prod(P_list_[level], result_list_[level+1]);
+      result_list_[level] += result_backup_list_[level];
+
+      // Apply Smoother postsmooth_ times.
+      viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_postsmooth_steps()),
+                                                   A_list_[level],
+                                                   result_list_[level],
+                                                   result_backup_list_[level],
+                                                   rhs_list_[level],
+                                                   static_cast<NumericT>(tag_.get_jacobi_weight()));
+    }
+    vec = result_list_[0];
+  }
+
+  /** @brief Returns the total number of multigrid levels in the hierarchy including the finest level. */
+  vcl_size_t levels() const { return residual_list_.size(); }
+
+
+  /** @brief Returns the problem/operator size at the respective multigrid level
+    *
+    * @param level     Index of the multigrid level. 0 is the finest level, levels() - 1 is the coarsest level.
+    */
+  vcl_size_t size(vcl_size_t level) const
+  {
+    assert(level < levels() && bool("Level index out of bounds!"));
+    return residual_list_[level].size();
+  }
+
+  /** @brief Returns the associated preconditioner tag containing the configuration for the multigrid preconditioner. */
+  amg_tag const & tag() const { return tag_; }
+
+private:
+  std::vector<SparseMatrixType> A_list_;
+  std::vector<SparseMatrixType> P_list_;
+  std::vector<SparseMatrixType> R_list_;
+  std::vector<AMGContextType>   amg_context_list_;
+
+  viennacl::matrix<NumericT>        coarsest_op_;
+
+  mutable std::vector<VectorType> result_list_;
+  mutable std::vector<VectorType> result_backup_list_;
+  mutable std::vector<VectorType> rhs_list_;
+  mutable std::vector<VectorType> residual_list_;
+
+  amg_tag tag_;
+};
+
+}
+}
+
+
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
new file mode 100644
index 0000000..9c7f79f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
@@ -0,0 +1,238 @@
+#ifndef VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+#define VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg_operations.hpp
+    @brief Implementations of operations for algebraic multigrid
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/host_based/amg_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/amg_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/amg_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace amg
+{
+
+template<typename NumericT, typename AMGContextT>
+void amg_influence(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_influence(A, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_influence(A, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_influence(A, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_coarse(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_coarse(A, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_coarse(A, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_coarse(A, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_interpol(compressed_matrix<NumericT> const & A,
+                  compressed_matrix<NumericT>       & P,
+                  AMGContextT & amg_context,
+                  amg_tag & tag)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::amg_interpol(A, P, amg_context, tag);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+
+template<typename NumericT>
+void amg_transpose(compressed_matrix<NumericT> & A,
+                   compressed_matrix<NumericT> & B)
+{
+  viennacl::context orig_ctx = viennacl::traits::context(A);
+  viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+  (void)orig_ctx;
+  (void)cpu_ctx;
+
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      A.switch_memory_context(cpu_ctx);
+      B.switch_memory_context(cpu_ctx);
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      A.switch_memory_context(orig_ctx);
+      B.switch_memory_context(orig_ctx);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      A.switch_memory_context(cpu_ctx);
+      B.switch_memory_context(cpu_ctx);
+      viennacl::linalg::host_based::amg::amg_transpose(A, B);
+      A.switch_memory_context(orig_ctx);
+      B.switch_memory_context(orig_ctx);
+      //viennacl::linalg::cuda::amg_transpose(A, B);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename SparseMatrixType, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+assign_to_dense(SparseMatrixType const & A,
+                viennacl::matrix_base<NumericT> & B)
+{
+  assert( (A.size1() == B.size1()) && bool("Size check failed for assignment to dense matrix: size1(A) != size1(B)"));
+  assert( (A.size2() == B.size2()) && bool("Size check failed for assignment to dense matrix: size2(A) != size2(B)"));
+
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::assign_to_dense(A, B);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::assign_to_dense(A, B);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::assign_to_dense(A, B);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+                   compressed_matrix<NumericT> const & A,
+                   vector<NumericT> & x,
+                   vector<NumericT> & x_backup,
+                   vector<NumericT> const & rhs_smooth,
+                   NumericT weight)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+      break;
+#endif
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+} //namespace amg
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
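
All of the AMG routines above follow the same backend-dispatch idiom: query the active memory handle of the operand and forward to the host-based, OpenCL, or CUDA implementation. As a minimal sketch of that idiom outside the AMG context (illustrative only; the helper name and the viennacl/traits/handle.hpp include are assumptions, not part of this commit):

#include <string>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/traits/handle.hpp"

// Reports which backend a compressed_matrix currently resides on, using the
// same switch on get_active_handle_id() as the AMG dispatchers above.
template<typename NumericT>
std::string active_backend(viennacl::compressed_matrix<NumericT> const & A)
{
  switch (viennacl::traits::handle(A).get_active_handle_id())
  {
    case viennacl::MAIN_MEMORY:   return "host";
#ifdef VIENNACL_WITH_OPENCL
    case viennacl::OPENCL_MEMORY: return "OpenCL";
#endif
#ifdef VIENNACL_WITH_CUDA
    case viennacl::CUDA_MEMORY:   return "CUDA";
#endif
    default:                      return "uninitialized or unknown";
  }
}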

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
new file mode 100644
index 0000000..57bc89a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
@@ -0,0 +1,598 @@
+#ifndef VIENNACL_LINALG_BICGSTAB_HPP_
+#define VIENNACL_LINALG_BICGSTAB_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bicgstab.hpp
+    @brief The stabilized bi-conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the stabilized Bi-conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class bicgstab_tag
+{
+public:
+  /** @brief The constructor
+  *
+  * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+  * @param max_iters        The maximum number of iterations
+  * @param max_iters_before_restart   The maximum number of iterations before BiCGStab is reinitialized (to avoid accumulation of round-off errors)
+  */
+  bicgstab_tag(double tol = 1e-8, vcl_size_t max_iters = 400, vcl_size_t max_iters_before_restart = 200)
+    : tol_(tol), abs_tol_(0), iterations_(max_iters), iterations_before_restart_(max_iters_before_restart) {}
+
+  /** @brief Returns the relative tolerance */
+  double tolerance() const { return tol_; }
+
+  /** @brief Returns the absolute tolerance */
+  double abs_tolerance() const { return abs_tol_; }
+  /** @brief Sets the absolute tolerance */
+  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+  /** @brief Returns the maximum number of iterations */
+  vcl_size_t max_iterations() const { return iterations_; }
+  /** @brief Returns the maximum number of iterations before a restart*/
+  vcl_size_t max_iterations_before_restart() const { return iterations_before_restart_; }
+
+  /** @brief Return the number of solver iterations: */
+  vcl_size_t iters() const { return iters_taken_; }
+  void iters(vcl_size_t i) const { iters_taken_ = i; }
+
+  /** @brief Returns the estimated relative error at the end of the solver run */
+  double error() const { return last_error_; }
+  /** @brief Sets the estimated relative error at the end of the solver run */
+  void error(double e) const { last_error_ = e; }
+
+private:
+  double tol_;
+  double abs_tol_;
+  vcl_size_t iterations_;
+  vcl_size_t iterations_before_restart_;
+
+  //return values from solver
+  mutable vcl_size_t iters_taken_;
+  mutable double last_error_;
+};
+
+
+
+namespace detail
+{
+  /** @brief Implementation of a pipelined stabilized Bi-conjugate gradient solver */
+  template<typename MatrixT, typename NumericT>
+  viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+                                             viennacl::vector_base<NumericT> const & rhs,
+                                             bicgstab_tag const & tag,
+                                             viennacl::linalg::no_precond,
+                                             bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                             void *monitor_data = NULL)
+  {
+    viennacl::vector<NumericT> result = viennacl::zero_vector<NumericT>(rhs.size(), viennacl::traits::context(rhs));
+
+    viennacl::vector<NumericT> residual = rhs;
+    viennacl::vector<NumericT> p = rhs;
+    viennacl::vector<NumericT> r0star = rhs;
+    viennacl::vector<NumericT> Ap = rhs;
+    viennacl::vector<NumericT> s  = rhs;
+    viennacl::vector<NumericT> As = rhs;
+
+    // Layout of temporary buffer:
+    //  chunk 0: <residual, r_0^*>
+    //  chunk 1: <As, As>
+    //  chunk 2: <As, s>
+    //  chunk 3: <Ap, r_0^*>
+    //  chunk 4: <As, r_0^*>
+    //  chunk 5: <s, s>
+    vcl_size_t buffer_size_per_vector = 256;
+    vcl_size_t num_buffer_chunks = 6;
+    viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
+    std::vector<NumericT>      host_inner_prod_buffer(inner_prod_buffer.size());
+
+    NumericT norm_rhs_host = viennacl::linalg::norm_2(residual);
+    NumericT beta;
+    NumericT alpha;
+    NumericT omega;
+    NumericT residual_norm = norm_rhs_host;
+    inner_prod_buffer[0] = norm_rhs_host * norm_rhs_host;
+
+    NumericT  r_dot_r0 = 0;
+    NumericT As_dot_As = 0;
+    NumericT As_dot_s  = 0;
+    NumericT Ap_dot_r0 = 0;
+    NumericT As_dot_r0 = 0;
+    NumericT  s_dot_s  = 0;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+      // Ap = A*p_j
+      // Ap_dot_r0 = <Ap, r_0^*>
+      viennacl::linalg::pipelined_bicgstab_prod(A, p, Ap, r0star,
+                                                inner_prod_buffer, buffer_size_per_vector, 3*buffer_size_per_vector);
+
+      //////// first (weak) synchronization point ////
+
+      ///// method 1: compute alpha on host:
+      //
+      //// we only need the second chunk of the buffer for computing Ap_dot_r0:
+      //viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+      //Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() +     buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
+
+      //alpha = residual_dot_r0 / Ap_dot_r0;
+
+      //// s_j = r_j - alpha_j q_j
+      //s = residual - alpha * Ap;
+
+      ///// method 2: compute alpha on device:
+      // s = r - alpha * Ap
+      // <s, s> first stage
+      // dump alpha at end of inner_prod_buffer
+      viennacl::linalg::pipelined_bicgstab_update_s(s, residual, Ap,
+                                                    inner_prod_buffer, buffer_size_per_vector, 5*buffer_size_per_vector);
+
+      // As = A*s_j
+      // As_dot_As = <As, As>
+      // As_dot_s  = <As, s>
+      // As_dot_r0 = <As, r_0^*>
+      viennacl::linalg::pipelined_bicgstab_prod(A, s, As, r0star,
+                                                inner_prod_buffer, buffer_size_per_vector, 4*buffer_size_per_vector);
+
+      //////// second (strong) synchronization point ////
+
+      viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+      typedef typename std::vector<NumericT>::difference_type       difference_type;
+
+       r_dot_r0 = std::accumulate(host_inner_prod_buffer.begin(),                                               host_inner_prod_buffer.begin() + difference_type(    buffer_size_per_vector), NumericT(0));
+      As_dot_As = std::accumulate(host_inner_prod_buffer.begin() + difference_type(    buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), NumericT(0));
+      As_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), NumericT(0));
+      Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), NumericT(0));
+      As_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), NumericT(0));
+       s_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(6 * buffer_size_per_vector), NumericT(0));
+
+      alpha =   r_dot_r0 / Ap_dot_r0;
+      beta  = - As_dot_r0 / Ap_dot_r0;
+      omega =   As_dot_s  / As_dot_As;
+
+      residual_norm = std::sqrt(s_dot_s - NumericT(2.0) * omega * As_dot_s + omega * omega *  As_dot_As);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      // x_{j+1} = x_j + alpha * p_j + omega * s_j
+      // r_{j+1} = s_j - omega * t_j
+      // p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+      // and compute first stage of r_dot_r0 = <r_{j+1}, r_o^*> for use in next iteration
+       viennacl::linalg::pipelined_bicgstab_vector_update(result, alpha, p, omega, s,
+                                                          residual, As,
+                                                          beta, Ap,
+                                                          r0star, inner_prod_buffer, buffer_size_per_vector);
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+  /** @brief Overload for the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        bicgstab_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Implementation of the unpreconditioned stabilized Bi-conjugate gradient solver
+  *
+  * Following the description in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+  *
+  * @param matrix       The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param monitor      A callback routine which is called in each iteration of the solver
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  template<typename MatrixT, typename VectorT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     bicgstab_tag const & tag,
+                     viennacl::linalg::no_precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type            NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    CPU_NumericType;
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT p = rhs;
+    VectorT r0star = rhs;
+    VectorT tmp0 = rhs;
+    VectorT tmp1 = rhs;
+    VectorT s = rhs;
+
+    CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+    CPU_NumericType ip_rr0star = norm_rhs_host * norm_rhs_host;
+    CPU_NumericType beta;
+    CPU_NumericType alpha;
+    CPU_NumericType omega;
+    //ScalarType inner_prod_temp; //temporary variable for inner product computation
+    CPU_NumericType new_ip_rr0star = 0;
+    CPU_NumericType residual_norm = norm_rhs_host;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    bool restart_flag = true;
+    vcl_size_t last_restart = 0;
+    for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+    {
+      if (restart_flag)
+      {
+        residual = viennacl::linalg::prod(matrix, result);
+        residual = rhs - residual;
+        p = residual;
+        r0star = residual;
+        ip_rr0star = viennacl::linalg::norm_2(residual);
+        ip_rr0star *= ip_rr0star;
+        restart_flag = false;
+        last_restart = i;
+      }
+
+      tag.iters(i+1);
+      tmp0 = viennacl::linalg::prod(matrix, p);
+      alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+      s = residual - alpha*tmp0;
+
+      tmp1 = viennacl::linalg::prod(matrix, s);
+      CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+      omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+      result += alpha * p + omega * s;
+      residual = s - omega * tmp1;
+
+      new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+      residual_norm = viennacl::linalg::norm_2(residual);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+      ip_rr0star = new_ip_rr0star;
+
+      if (    (ip_rr0star <= 0 && ip_rr0star >= 0)
+           || (omega <= 0 && omega >= 0)
+           || (i - last_restart > tag.max_iterations_before_restart())
+         ) //search direction degenerate. A restart might help
+        restart_flag = true;
+
+      // Execution of
+      //  p = residual + beta * (p - omega*tmp0);
+      // without introducing temporary vectors:
+      p -= omega * tmp0;
+      p = residual + beta * p;
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+
+  /** @brief Implementation of the preconditioned stabilized Bi-conjugate gradient solver
+  *
+  * Following the description of the unpreconditioned case in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+  *
+  * @param matrix       The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param precond      A preconditioner. Precondition operation is done via member function apply()
+  * @param monitor      A callback routine which is called in each iteration of the solver
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  template<typename MatrixT, typename VectorT, typename PreconditionerT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     bicgstab_tag const & tag,
+                     PreconditionerT const & precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type            NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    CPU_NumericType;
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT r0star = residual;  //can be chosen arbitrarily in fact
+    VectorT tmp0 = rhs;
+    VectorT tmp1 = rhs;
+    VectorT s = rhs;
+
+    VectorT p = residual;
+
+    CPU_NumericType ip_rr0star = viennacl::linalg::norm_2(residual);
+    CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+    CPU_NumericType beta;
+    CPU_NumericType alpha;
+    CPU_NumericType omega;
+    CPU_NumericType new_ip_rr0star = 0;
+    CPU_NumericType residual_norm = norm_rhs_host;
+
+    if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    bool restart_flag = true;
+    vcl_size_t last_restart = 0;
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      if (restart_flag)
+      {
+        residual = viennacl::linalg::prod(matrix, result);
+        residual = rhs - residual;
+        precond.apply(residual);
+        p = residual;
+        r0star = residual;
+        ip_rr0star = viennacl::linalg::norm_2(residual);
+        ip_rr0star *= ip_rr0star;
+        restart_flag = false;
+        last_restart = i;
+      }
+
+      tag.iters(i+1);
+      tmp0 = viennacl::linalg::prod(matrix, p);
+      precond.apply(tmp0);
+      alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+      s = residual - alpha*tmp0;
+
+      tmp1 = viennacl::linalg::prod(matrix, s);
+      precond.apply(tmp1);
+      CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+      omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+      result += alpha * p + omega * s;
+      residual = s - omega * tmp1;
+
+      residual_norm = viennacl::linalg::norm_2(residual);
+      if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+        break;
+      if (residual_norm / norm_rhs_host < tag.tolerance() || residual_norm < tag.abs_tolerance())
+        break;
+
+      new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+
+      beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+      ip_rr0star = new_ip_rr0star;
+
+      if ( (ip_rr0star >= 0 && ip_rr0star <= 0) || (omega >=0 && omega <= 0) || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+        restart_flag = true;
+
+      // Execution of
+      //  p = residual + beta * (p - omega*tmp0);
+      // without introducing temporary vectors:
+      p -= omega * tmp0;
+      p = residual + beta * p;
+
+      //std::cout << "Rel. Residual in current step: " << std::sqrt(std::fabs(viennacl::linalg::inner_prod(residual, residual) / norm_rhs_host)) << std::endl;
+    }
+
+    //store last error estimate:
+    tag.error(residual_norm / norm_rhs_host);
+
+    return result;
+  }
+
+}
+
+
+
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+  return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+
+/** @brief Convenience overload for calling the preconditioned BiCGStab solver using types from the C++ STL.
+  *
+  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+  */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+  viennacl::compressed_matrix<NumericT> vcl_A;
+  viennacl::copy(A, vcl_A);
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::copy(rhs, vcl_rhs);
+
+  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+  std::vector<NumericT> result(vcl_result.size());
+  viennacl::copy(vcl_result, result);
+  return result;
+}
+
+/** @brief Entry point for the unpreconditioned BiCGStab method.
+ *
+ *  @param matrix    The system matrix
+ *  @param rhs       Right hand side vector (load vector)
+ *  @param tag       A BiCGStab tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag)
+{
+  return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
+
+
+
+template<typename VectorT>
+class bicgstab_solver
+{
+public:
+  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;
+
+  bicgstab_solver(bicgstab_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+  template<typename MatrixT, typename PreconditionerT>
+  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+  {
+    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+    {
+      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+      mod_rhs = b - mod_rhs;
+      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+      return init_guess_ + y;
+    }
+    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+  }
+
+
+  template<typename MatrixT>
+  VectorT operator()(MatrixT const & A, VectorT const & b) const
+  {
+    return operator()(A, b, viennacl::linalg::no_precond());
+  }
+
+  /** @brief Specifies an initial guess for the iterative solver.
+    *
+    * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+    */
+  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+   *
+   *  The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+   *  The third argument is a pointer to user-defined data, through which additional information can be passed.
+   *  This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+   *  If the monitor function returns true, the solver terminates (regardless of convergence or divergence).
+   */
+  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+  {
+    monitor_callback_ = monitor_fun;
+    user_data_ = user_data;
+  }
+
+  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+  bicgstab_tag const & tag() const { return tag_; }
+
+private:
+  bicgstab_tag  tag_;
+  VectorT       init_guess_;
+  bool          (*monitor_callback_)(VectorT const &, numeric_type, void *);
+  void          *user_data_;
+};
+
+
+}
+}
+
+#endif
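
A minimal usage sketch for the solver interface above, assuming a filled system matrix A and right-hand side rhs (the tolerance and iteration counts are arbitrary example values, not recommendations from this commit):

#include <iostream>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/bicgstab.hpp"

template<typename NumericT>
viennacl::vector<NumericT> bicgstab_example(viennacl::compressed_matrix<NumericT> const & A,
                                            viennacl::vector<NumericT> const & rhs)
{
  // relative tolerance 1e-8, at most 400 iterations, restart after 200 iterations
  viennacl::linalg::bicgstab_tag tag(1e-8, 400, 200);

  // unpreconditioned solve; a preconditioner object could be passed as a fourth argument
  viennacl::vector<NumericT> x = viennacl::linalg::solve(A, rhs, tag);

  std::cout << "BiCGStab: " << tag.iters() << " iterations, "
            << "estimated relative error " << tag.error() << std::endl;
  return x;
}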

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
new file mode 100644
index 0000000..a2daf5e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
@@ -0,0 +1,179 @@
+#ifndef VIENNACL_LINALG_BISECT_HPP_
+#define VIENNACL_LINALG_BISECT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bisect.hpp
+*   @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
+*
+*   Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <cstddef>
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+  /**
+  *    @brief overloaded function for copying vectors
+  */
+  template<typename NumericT, typename OtherVectorT>
+  void copy_vec_to_vec(viennacl::vector<NumericT> const & src, OtherVectorT & dest)
+  {
+    viennacl::copy(src, dest);
+  }
+
+  template<typename OtherVectorT, typename NumericT>
+  void copy_vec_to_vec(OtherVectorT const & src, viennacl::vector<NumericT> & dest)
+  {
+    viennacl::copy(src, dest);
+  }
+
+  template<typename VectorT1, typename VectorT2>
+  void copy_vec_to_vec(VectorT1 const & src, VectorT2 & dest)
+  {
+    for (vcl_size_t i=0; i<src.size(); ++i)
+      dest[i] = src[i];
+  }
+
+} //namespace detail
+
+/**
+*   @brief Implementation of the bisection algorithm for calculating the eigenvalues of a symmetric tridiagonal matrix. Experimental - interface might change.
+*
+*   Refer to "Calculation of the Eigenvalues of a Symmetric Tridiagonal Matrix by the Method of Bisection" in the Handbook Series Linear Algebra, contributed by Barth, Martin, and Wilkinson.
+*   http://www.maths.ed.ac.uk/~aar/papers/bamawi.pdf
+*
+*   @param alphas       Elements of the main diagonal
+*   @param betas        Elements of the secondary diagonal
+*   @return             Returns the eigenvalues of the tridiagonal matrix defined by alphas and betas
+*/
+template<typename VectorT>
+std::vector<
+        typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
+        >
+bisect(VectorT const & alphas, VectorT const & betas)
+{
+  typedef typename viennacl::result_of::value_type<VectorT>::type           NumericType;
+  typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;
+
+  vcl_size_t size = betas.size();
+  std::vector<CPU_NumericType>  x_temp(size);
+
+
+  std::vector<CPU_NumericType> beta_bisect;
+  std::vector<CPU_NumericType> wu;
+
+  double rel_error = std::numeric_limits<CPU_NumericType>::epsilon();
+  beta_bisect.push_back(0);
+
+  for (vcl_size_t i = 1; i < size; i++)
+    beta_bisect.push_back(betas[i] * betas[i]);
+
+  double xmin = alphas[size - 1] - std::fabs(betas[size - 1]);
+  double xmax = alphas[size - 1] + std::fabs(betas[size - 1]);
+
+  for (vcl_size_t i = 0; i < size - 1; i++)
+  {
+    double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
+    if (alphas[i] + h > xmax)
+      xmax = alphas[i] + h;
+    if (alphas[i] - h < xmin)
+      xmin = alphas[i] - h;
+  }
+
+
+  double eps1 = 1e-6;
+  /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
+  if (eps1 <= 0)
+    eps1 = eps2;
+  else
+    eps2 = 0.5 * eps1 + 7.0 * eps2; */
+
+  double x0 = xmax;
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    x_temp[i] = xmax;
+    wu.push_back(xmin);
+  }
+
+  for (long k = static_cast<long>(size) - 1; k >= 0; --k)
+  {
+    double xu = xmin;
+    for (long i = k; i >= 0; --i)
+    {
+      if (xu < wu[vcl_size_t(i)])
+      {
+        xu = wu[vcl_size_t(i)];
+        break;
+      }
+    }
+
+    if (x0 > x_temp[vcl_size_t(k)])
+      x0 = x_temp[vcl_size_t(k)];
+
+    double x1 = (xu + x0) / 2.0;
+    while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
+    {
+      vcl_size_t a = 0;
+      double q = 1;
+      for (vcl_size_t i = 0; i < size; i++)
+      {
+        if (q > 0 || q < 0)
+          q = alphas[i] - x1 - beta_bisect[i] / q;
+        else
+          q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
+
+        if (q < 0)
+          a++;
+      }
+
+      if (a <= static_cast<vcl_size_t>(k))
+      {
+        xu = x1;
+        if (a < 1)
+          wu[0] = x1;
+        else
+        {
+          wu[a] = x1;
+          if (x_temp[a - 1] > x1)
+              x_temp[a - 1] = x1;
+        }
+      }
+      else
+        x0 = x1;
+
+      x1 = (xu + x0) / 2.0;
+    }
+    x_temp[vcl_size_t(k)] = x1;
+  }
+  return x_temp;
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
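
A short usage sketch for the host-side bisect() overload above, computing the eigenvalues of a small symmetric tridiagonal matrix. The concrete numbers and the convention that the off-diagonal vector carries a leading placeholder entry are illustrative assumptions; only the call signature comes from the code above.

#include <cstddef>
#include <iostream>
#include <vector>
#include "viennacl/linalg/bisect.hpp"

int main()
{
  // 4x4 tridiagonal matrix with 2 on the main diagonal and -1 on the off-diagonals
  std::vector<double> alphas(4, 2.0);   // main diagonal
  std::vector<double> betas(4, -1.0);   // off-diagonal; betas[i] couples rows i-1 and i
  betas[0] = 0.0;                       // leading entry is only a placeholder

  std::vector<double> eigenvalues = viennacl::linalg::bisect(alphas, betas);

  for (std::size_t i = 0; i < eigenvalues.size(); ++i)
    std::cout << "lambda_" << i << " = " << eigenvalues[i] << std::endl;
  return 0;
}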

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
new file mode 100644
index 0000000..6918b14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
@@ -0,0 +1,173 @@
+#ifndef VIENNACL_LINALG_BISECT_GPU
+#define VIENNACL_LINALG_BISECT_GPU
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/bisect_gpu.hpp
+    @brief Implementation of a bisection algorithm for eigenvalues
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/gerschgorin.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_large.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_small.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect           The bisection algorithm computes the eigenvalues
+//!                         of a symmetric tridiagonal matrix.
+//! @param diagonal         diagonal elements of the matrix
+//! @param superdiagonal    superdiagonal elements of the matrix
+//! @param eigenvalues      Vector with the eigenvalues in ascending order
+//! @return                 Returns false if any errors occurred
+///
+//! overloaded function template: std::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const std::vector<NumericT> & diagonal, const std::vector<NumericT> & superdiagonal, std::vector<NumericT> & eigenvalues)
+{
+  assert(diagonal.size() == superdiagonal.size() &&
+         diagonal.size() == eigenvalues.size()   &&
+         bool("Input vectors do not have the same sizes!"));
+  bool bResult = false;
+  // desired precision of the eigenvalues
+  NumericT  precision = static_cast<NumericT>(0.00001);
+  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+  // set up input
+  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+  NumericT lg =  FLT_MAX;
+  NumericT ug = -FLT_MAX;
+  // compute Gerschgorin interval
+  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+  // decide whether the algorithm for small or for large matrices will be started
+  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks,
+    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+    eigenvalues = result.std_eigenvalues;
+    bResult = true;
+  }
+
+  else
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks
+    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+    eigenvalues = result.std_eigenvalues;
+  }
+  return bResult;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect           The bisection algorithm computes the eigenvalues
+//!                         of a symmetric tridiagonal matrix.
+//! @param diagonal         diagonal elements of the matrix
+//! @param superdiagonal    superdiagonal elements of the matrix
+//! @param eigenvalues      Vector with the eigenvalues in ascending order
+//! @return                 Returns false if any errors occurred
+///
+//! overloaded function template: viennacl::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const viennacl::vector<NumericT> & diagonal, const viennacl::vector<NumericT> & superdiagonal, viennacl::vector<NumericT> & eigenvalues)
+{
+  assert(diagonal.size() == superdiagonal.size() &&
+         diagonal.size() == eigenvalues.size()   &&
+         bool("Input vectors do not have the same sizes!"));
+  bool bResult = false;
+  // desired precision of the eigenvalues
+  NumericT  precision = static_cast<NumericT>(0.00001);
+  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+  // set up input
+  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+  NumericT lg =  FLT_MAX;
+  NumericT ug = -FLT_MAX;
+  // compute Gerschgorin interval
+  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+  // decide whether the algorithm for small or for large matrices will be started
+  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks,
+    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+    copy(result.std_eigenvalues, eigenvalues);
+    bResult = true;
+  }
+
+  else
+  {
+    // initialize memory for result
+    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+    // run the kernel
+    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+    // get the result from the device and do some sanity checks
+    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+    copy(result.std_eigenvalues, eigenvalues);
+  }
+  return bResult;
+}
+} // namespace linalg
+} // namespace viennacl
+#endif
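
Both overloads above first bracket the spectrum with a Gerschgorin interval (the [lg, ug] passed on to the bisection kernels) before bisecting. A plain host-side sketch of such a bound for a symmetric tridiagonal matrix follows (illustrative only; computeGerschgorin() itself may use different storage conventions):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Gerschgorin bound for a symmetric tridiagonal matrix: every eigenvalue lies in
// [lower, upper], where row i contributes the interval d[i] +/- (|s[i-1]| + |s[i]|).
// Here d has n entries (n >= 1) and s has n-1 entries, s[i] coupling rows i and i+1.
template<typename NumericT>
void gerschgorin_interval(std::vector<NumericT> const & d,
                          std::vector<NumericT> const & s,
                          NumericT & lower, NumericT & upper)
{
  const std::size_t n = d.size();
  lower = d[0];
  upper = d[0];
  for (std::size_t i = 0; i < n; ++i)
  {
    NumericT radius = 0;
    if (i > 0)     radius += std::fabs(s[i - 1]);
    if (i + 1 < n) radius += std::fabs(s[i]);
    lower = std::min(lower, d[i] - radius);
    upper = std::max(upper, d[i] + radius);
  }
}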

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
new file mode 100644
index 0000000..93aae81
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
@@ -0,0 +1,440 @@
+#ifndef VIENNACL_LINALG_CG_HPP_
+#define VIENNACL_LINALG_CG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cg.hpp
+    @brief The conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class cg_tag
+{
+public:
+  /** @brief The constructor
+  *
+  * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+  * @param max_iterations   The maximum number of iterations
+  */
+  cg_tag(double tol = 1e-8, unsigned int max_iterations = 300) : tol_(tol), abs_tol_(0), iterations_(max_iterations) {}
+
+  /** @brief Returns the relative tolerance */
+  double tolerance() const { return tol_; }
+
+  /** @brief Returns the absolute tolerance */
+  double abs_tolerance() const { return abs_tol_; }
+  /** @brief Sets the absolute tolerance */
+  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+  /** @brief Returns the maximum number of iterations */
+  unsigned int max_iterations() const { return iterations_; }
+
+  /** @brief Return the number of solver iterations: */
+  unsigned int iters() const { return iters_taken_; }
+  void iters(unsigned int i) const { iters_taken_ = i; }
+
+  /** @brief Returns the estimated relative error at the end of the solver run */
+  double error() const { return last_error_; }
+  /** @brief Sets the estimated relative error at the end of the solver run */
+  void error(double e) const { last_error_ = e; }
+
+
+private:
+  double tol_;
+  double abs_tol_;
+  unsigned int iterations_;
+
+  //return values from solver
+  mutable unsigned int iters_taken_;
+  mutable double last_error_;
+};
+
+namespace detail
+{
+
+  /** @brief handles the no_precond case at minimal overhead */
+  template<typename VectorT, typename PreconditionerT>
+  class z_handler{
+  public:
+    z_handler(VectorT & residual) : z_(residual){ }
+    VectorT & get() { return z_; }
+  private:
+    VectorT z_;
+  };
+
+  template<typename VectorT>
+  class z_handler<VectorT, viennacl::linalg::no_precond>{
+  public:
+    z_handler(VectorT & residual) : presidual_(&residual){ }
+    VectorT & get() { return *presidual_; }
+  private:
+    VectorT * presidual_;
+  };
+
+}
+
+namespace detail
+{
+
+  /** @brief Implementation of a pipelined conjugate gradient algorithm (no preconditioner), specialized for ViennaCL types.
+  *
+  * Pipelined version from A. T. Chronopoulos and C. W. Gear, J. Comput. Appl. Math. 25(2), 153-168 (1989)
+  *
+  * @param A            The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param monitor      A callback routine which is called in each iteration of the solver
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  //template<typename MatrixType, typename ScalarType>
+  template<typename MatrixT, typename NumericT>
+  viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+                                             viennacl::vector<NumericT> const & rhs,
+                                             cg_tag const & tag,
+                                             viennacl::linalg::no_precond,
+                                             bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                             void *monitor_data = NULL)
+  {
+    typedef typename viennacl::vector<NumericT>::difference_type   difference_type;
+
+    viennacl::vector<NumericT> result(rhs);
+    viennacl::traits::clear(result);
+
+    viennacl::vector<NumericT> residual(rhs);
+    viennacl::vector<NumericT> p(rhs);
+    viennacl::vector<NumericT> Ap = viennacl::linalg::prod(A, p);
+    viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(3*256, viennacl::traits::context(rhs)); // temporary buffer
+    std::vector<NumericT>      host_inner_prod_buffer(inner_prod_buffer.size());
+    vcl_size_t                 buffer_size_per_vector = inner_prod_buffer.size() / 3;
+    difference_type            buffer_offset_per_vector = static_cast<difference_type>(buffer_size_per_vector);
+
+    NumericT norm_rhs_squared = viennacl::linalg::norm_2(residual); norm_rhs_squared *= norm_rhs_squared;
+
+    if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //check for early convergence of A*x = 0
+      return result;
+
+    NumericT inner_prod_rr = norm_rhs_squared;
+    NumericT alpha = inner_prod_rr / viennacl::linalg::inner_prod(p, Ap);
+    NumericT beta  = viennacl::linalg::norm_2(Ap); beta = (alpha * alpha * beta * beta - inner_prod_rr) / inner_prod_rr;
+    NumericT inner_prod_ApAp = 0;
+    NumericT inner_prod_pAp  = 0;
+
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+
+      viennacl::linalg::pipelined_cg_vector_update(result, alpha, p, residual, Ap, beta, inner_prod_buffer);
+      viennacl::linalg::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+
+      // bring back the partial results to the host:
+      viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+      inner_prod_rr   = std::accumulate(host_inner_prod_buffer.begin(),                                host_inner_prod_buffer.begin() +     buffer_offset_per_vector, NumericT(0));
+      inner_prod_ApAp = std::accumulate(host_inner_prod_buffer.begin() +     buffer_offset_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, NumericT(0));
+      inner_prod_pAp  = std::accumulate(host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, host_inner_prod_buffer.begin() + 3 * buffer_offset_per_vector, NumericT(0));
+
+      if (monitor && monitor(result, std::sqrt(std::fabs(inner_prod_rr / norm_rhs_squared)), monitor_data))
+        break;
+      if (std::fabs(inner_prod_rr / norm_rhs_squared) < tag.tolerance() *  tag.tolerance() || std::fabs(inner_prod_rr) < tag.abs_tolerance() * tag.abs_tolerance())    //squared norms involved here
+        break;
+
+      alpha = inner_prod_rr / inner_prod_pAp;
+      beta  = (alpha*alpha*inner_prod_ApAp - inner_prod_rr) / inner_prod_rr;
+    }
+
+    //store last error estimate:
+    tag.error(std::sqrt(std::fabs(inner_prod_rr) / norm_rhs_squared));
+
+    return result;
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        cg_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  template<typename MatrixT, typename VectorT, typename PreconditionerT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     cg_tag const & tag,
+                     PreconditionerT const & precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type           NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;
+
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    VectorT residual = rhs;
+    VectorT tmp = rhs;
+    detail::z_handler<VectorT, PreconditionerT> zhandler(residual);
+    VectorT & z = zhandler.get();
+
+    precond.apply(z);
+    VectorT p = z;
+
+    CPU_NumericType ip_rr = viennacl::linalg::inner_prod(residual, z);
+    CPU_NumericType alpha;
+    CPU_NumericType new_ip_rr = 0;
+    CPU_NumericType beta;
+    CPU_NumericType norm_rhs_squared = ip_rr;
+    CPU_NumericType new_ipp_rr_over_norm_rhs;
+
+    if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //solution is zero if RHS norm (squared) is zero
+      return result;
+
+    for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+    {
+      tag.iters(i+1);
+      tmp = viennacl::linalg::prod(matrix, p);
+
+      alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
+
+      result += alpha * p;
+      residual -= alpha * tmp;
+      z = residual;
+      precond.apply(z);
+
+      if (static_cast<VectorT*>(&residual)==static_cast<VectorT*>(&z))
+        new_ip_rr = std::pow(viennacl::linalg::norm_2(residual),2);
+      else
+        new_ip_rr = viennacl::linalg::inner_prod(residual, z);
+
+      new_ipp_rr_over_norm_rhs = new_ip_rr / norm_rhs_squared;
+      if (monitor && monitor(result, std::sqrt(std::fabs(new_ipp_rr_over_norm_rhs)), monitor_data))
+        break;
+      if (std::fabs(new_ipp_rr_over_norm_rhs) < tag.tolerance() *  tag.tolerance() || std::fabs(new_ip_rr) < tag.abs_tolerance() * tag.abs_tolerance())    //squared norms involved here
+        break;
+
+      beta = new_ip_rr / ip_rr;
+      ip_rr = new_ip_rr;
+
+      p = z + beta*p;
+    }
+
+    //store last error estimate:
+    tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));
+
+    return result;
+  }
+
+}
+
+
+
+/** @brief Implementation of the preconditioned conjugate gradient solver; generic version for non-ViennaCL types.
+*
+* Following Algorithm 9.1 in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+*
+* @param matrix     The system matrix
+* @param rhs        The load vector
+* @param tag        Solver configuration tag
+* @param precond    A preconditioner. Precondition operation is done via member function apply()
+* @return The result vector
+*/
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+  return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+/** @brief Convenience overload for calling the CG solver using types from the C++ STL.
+  *
+  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+  */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+  viennacl::compressed_matrix<NumericT> vcl_A;
+  viennacl::copy(A, vcl_A);
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::copy(rhs, vcl_rhs);
+
+  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+  std::vector<NumericT> result(vcl_result.size());
+  viennacl::copy(vcl_result, result);
+  return result;
+}
+
+/** @brief Entry point for the unpreconditioned CG method.
+ *
+ *  @param matrix    The system matrix
+ *  @param rhs       Right hand side vector (load vector)
+ *  @param tag       A CG tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag)
+{
+  return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
+
+
+
+template<typename VectorT>
+class cg_solver
+{
+public:
+  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;
+
+  cg_solver(cg_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+  template<typename MatrixT, typename PreconditionerT>
+  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+  {
+    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+    {
+      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+      mod_rhs = b - mod_rhs;
+      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+      return init_guess_ + y;
+    }
+    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+  }
+
+
+  template<typename MatrixT>
+  VectorT operator()(MatrixT const & A, VectorT const & b) const
+  {
+    return operator()(A, b, viennacl::linalg::no_precond());
+  }
+
+  /** @brief Specifies an initial guess for the iterative solver.
+    *
+    * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+    */
+  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without a monitor.
+   *
+   *  The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+   *  The third argument is a pointer to user-defined data, through which additional information can be passed.
+   *  This pointer is supplied as the second argument of set_monitor(). If not set, NULL is passed.
+   *  If the monitor function returns true, the solver terminates (due to either convergence or divergence).
+   */
+  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+  {
+    monitor_callback_ = monitor_fun;
+    user_data_ = user_data;
+  }
+
+  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+  cg_tag const & tag() const { return tag_; }
+
+private:
+  cg_tag   tag_;
+  VectorT  init_guess_;
+  bool     (*monitor_callback_)(VectorT const &, numeric_type, void *);
+  void     *user_data_;
+};
+
+
+}
+}
+
+#endif
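
For illustration, a minimal usage sketch of the CG interfaces added above. It assumes the surrounding header is the usual viennacl/linalg/cg.hpp and that cg_tag exposes iters()/error() getters; the 1D Laplace system and the tolerance/iteration values are made up for the example, while the monitor signature is the one declared in cg_solver.

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cg.hpp"

// Illustrative monitor: print the relative residual estimate of each iteration.
static bool print_monitor(viennacl::vector<double> const & /*x*/, double rel_residual, void * /*user_data*/)
{
  std::cout << "relative residual: " << rel_residual << std::endl;
  return false;   // returning true would stop the solver early
}

int main()
{
  std::size_t N = 100;
  std::vector<std::map<unsigned int, double> > cpu_A(N);   // assembled 1D Laplace matrix (example data)
  std::vector<double> cpu_rhs(N, 1.0);
  for (std::size_t i = 0; i < N; ++i)
  {
    cpu_A[i][static_cast<unsigned int>(i)] = 2.0;
    if (i > 0)     cpu_A[i][static_cast<unsigned int>(i - 1)] = -1.0;
    if (i + 1 < N) cpu_A[i][static_cast<unsigned int>(i + 1)] = -1.0;
  }

  viennacl::linalg::cg_tag tag(1e-8, 300);   // relative tolerance, maximum iterations

  // STL overload: the data is copied to a compressed_matrix internally.
  std::vector<double> x = viennacl::linalg::solve(cpu_A, cpu_rhs, tag, viennacl::linalg::no_precond());

  // ViennaCL types together with the stateful cg_solver wrapper and a monitor callback.
  viennacl::compressed_matrix<double> A;
  viennacl::vector<double> rhs(N);
  viennacl::copy(cpu_A, A);
  viennacl::copy(cpu_rhs, rhs);

  viennacl::linalg::cg_solver<viennacl::vector<double> > solver(tag);
  solver.set_monitor(print_monitor, NULL);
  viennacl::vector<double> x2 = solver(A, rhs);   // unpreconditioned; picks the pipelined overload

  std::cout << "iterations: " << solver.tag().iters()
            << ", estimated error: " << solver.tag().error() << std::endl;
  return 0;
}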

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
new file mode 100644
index 0000000..5325b7b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
@@ -0,0 +1,75 @@
+#ifndef VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/circulant_matrix_operations.hpp
+    @brief Implementations of operations using circulant_matrix. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication with a circulant_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::circulant_matrix<NumericT, AlignmentV> const & mat,
+               viennacl::vector_base<NumericT> const & vec,
+               viennacl::vector_base<NumericT>       & result)
+{
+  assert(mat.size1() == result.size() && bool("Dimension mismatch"));
+  assert(mat.size2() == vec.size() && bool("Dimension mismatch"));
+  //result.clear();
+
+  //std::cout << "prod(circulant_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
+
+  viennacl::vector<NumericT> circ(mat.elements().size() * 2);
+  viennacl::linalg::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+  viennacl::vector<NumericT> tmp(vec.size() * 2);
+  viennacl::vector<NumericT> tmp2(vec.size() * 2);
+
+  viennacl::linalg::real_to_complex(vec, tmp, vec.size());
+  viennacl::linalg::convolve(circ, tmp, tmp2);
+  viennacl::linalg::complex_to_real(tmp2, result, vec.size());
+
+}
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
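
For illustration, a minimal sketch of how the FFT-based product above can be driven. Only prod_impl() itself is part of this file; the circulant_matrix type (and however it gets assembled) comes from viennacl/circulant_matrix.hpp, and an OpenCL-capable context is assumed since the routine relies on viennacl/fft.hpp.

#include "viennacl/circulant_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/circulant_matrix_operations.hpp"

// Multiplies an already assembled circulant matrix C with x via the
// convolution-theorem implementation shown above: y = C * x.
template<typename NumericT>
viennacl::vector<NumericT> apply_circulant(viennacl::circulant_matrix<NumericT> const & C,
                                           viennacl::vector<NumericT> const & x)
{
  viennacl::vector<NumericT> y(x.size());
  viennacl::linalg::prod_impl(C, x, y);   // dispatches to the FFT-based routine in this header
  return y;
}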


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
new file mode 100644
index 0000000..67d089a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
@@ -0,0 +1,105 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_UTILS_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_UTILS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/utils.hpp
+ *
+ * A collection of utilities for the device specific execution templates.
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+inline void compute_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, scheduler::op_element const & op)
+{
+  if (utils::elementwise_function(op))
+    os << acc << "=" << tree_parsing::evaluate(op.type) << "(" << acc << "," << cur << ");" << std::endl;
+  else
+    os << acc << "= (" << acc << ")" << tree_parsing::evaluate(op.type)  << "(" << cur << ");" << std::endl;
+}
+
+inline void compute_index_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, scheduler::op_element const & op)
+{
+  //        os << acc << " = " << cur_value << ">" << acc_value  << "?" << cur << ":" << acc << ";" << std::endl;
+  os << acc << "= select(" << acc << "," << cur << "," << cur_value << ">" << acc_value << ");" << std::endl;
+  os << acc_value << "=";
+  if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE) os << "fmax";
+  if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE) os << "max";
+  if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE) os << "fmin";
+  if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE) os << "min";
+  os << "(" << acc_value << "," << cur_value << ");"<< std::endl;
+}
+
+inline void process_all(std::string const & type_key, std::string const & str,
+                        utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings)
+{
+  for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
+    for (mapping_type::const_iterator mmit = mit->begin(); mmit != mit->end(); ++mmit)
+      if (mmit->second->type_key()==type_key)
+        stream << mmit->second->process(str) << std::endl;
+}
+
+
+inline void process_all_at(std::string const & type_key, std::string const & str,
+                           utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings,
+                           vcl_size_t root_idx, leaf_t leaf)
+{
+  for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
+  {
+    mapped_object * obj = at(*mit, mapping_key(root_idx, leaf)).get();
+    if (obj->type_key()==type_key)
+      stream << obj->process(str) << std::endl;
+  }
+}
+
+inline std::string neutral_element(scheduler::op_element const & op)
+{
+  switch (op.type)
+  {
+  case scheduler::OPERATION_BINARY_ADD_TYPE : return "0";
+  case scheduler::OPERATION_BINARY_MULT_TYPE : return "1";
+  case scheduler::OPERATION_BINARY_DIV_TYPE : return "1";
+  case scheduler::OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "-INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "-INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_MAX_TYPE : return "-INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "-INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_MIN_TYPE : return "INFINITY";
+  case scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "INFINITY";
+
+  default: throw generator_not_supported_exception("Unsupported reduction operator : no neutral element known");
+  }
+}
+
+}
+}
+
+#endif
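
For orientation, a small sketch of what the helpers above emit; compute_reduction() and neutral_element() only assemble strings for the OpenCL kernel being generated. Treating scheduler::op_element as a plain struct with type_family and type members is an assumption of this sketch.

#include <iostream>

#include "viennacl/device_specific/templates/utils.hpp"

int main()
{
  namespace ds = viennacl::device_specific;
  ds::utils::kernel_generation_stream os;

  viennacl::scheduler::op_element op_add;
  op_add.type_family = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY;
  op_add.type        = viennacl::scheduler::OPERATION_BINARY_ADD_TYPE;

  viennacl::scheduler::op_element op_fmax;
  op_fmax.type_family = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY;
  op_fmax.type        = viennacl::scheduler::OPERATION_BINARY_ELEMENT_FMAX_TYPE;

  ds::compute_reduction(os, "acc", "cur", op_add);    // appends: acc= (acc)+(cur);
  ds::compute_reduction(os, "acc", "cur", op_fmax);   // appends: acc=fmax(acc,cur);

  std::cout << os.str();
  std::cout << "neutral element of +   : " << ds::neutral_element(op_add)  << std::endl;   // "0"
  std::cout << "neutral element of fmax: " << ds::neutral_element(op_fmax) << std::endl;   // "-INFINITY"
  return 0;
}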

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
new file mode 100644
index 0000000..f9cc8a8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
@@ -0,0 +1,512 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TREE_PARSING_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TREE_PARSING_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/tree_parsing.hpp
+    @brief Code for parsing the expression trees.
+*/
+
+#include <set>
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/utils.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace tree_parsing
+{
+
+/** @brief base functor class for traversing a statement */
+class traversal_functor
+{
+public:
+  void call_before_expansion(scheduler::statement const &, vcl_size_t) const { }
+  void call_after_expansion(scheduler::statement const &, vcl_size_t) const { }
+};
+
+/** @brief Recursively execute a functor on a statement */
+template<class Fun>
+inline void traverse(scheduler::statement const & statement, vcl_size_t root_idx, Fun const & fun, bool inspect)
+{
+  scheduler::statement_node const & root_node = statement.array()[root_idx];
+  bool recurse = utils::node_leaf(root_node.op)?inspect:true;
+
+  fun.call_before_expansion(statement, root_idx);
+
+  //Lhs:
+  if (recurse)
+  {
+    if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      traverse(statement, root_node.lhs.node_index, fun, inspect);
+    if (root_node.lhs.type_family != scheduler::INVALID_TYPE_FAMILY)
+      fun(statement, root_idx, LHS_NODE_TYPE);
+  }
+
+  //Self:
+  fun(statement, root_idx, PARENT_NODE_TYPE);
+
+  //Rhs:
+  if (recurse && root_node.rhs.type_family!=scheduler::INVALID_TYPE_FAMILY)
+  {
+    if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      traverse(statement, root_node.rhs.node_index, fun, inspect);
+    if (root_node.rhs.type_family != scheduler::INVALID_TYPE_FAMILY)
+      fun(statement, root_idx, RHS_NODE_TYPE);
+  }
+
+  fun.call_after_expansion(statement, root_idx);
+}
+
+class filter : public traversal_functor
+{
+public:
+  typedef bool (*pred_t)(scheduler::statement_node const & node);
+
+  filter(pred_t pred, std::vector<vcl_size_t> & out) : pred_(pred), out_(out){ }
+
+  void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t) const
+  {
+    scheduler::statement_node const * root_node = &statement.array()[root_idx];
+    if (pred_(*root_node))
+      out_.push_back(root_idx);
+  }
+private:
+  pred_t pred_;
+  std::vector<vcl_size_t> & out_;
+};
+
+class filter_elements : public traversal_functor
+{
+public:
+  filter_elements(scheduler::statement_node_subtype subtype, std::vector<scheduler::lhs_rhs_element> & out) : subtype_(subtype), out_(out) { }
+
+  void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t) const
+  {
+    scheduler::statement_node const * root_node = &statement.array()[root_idx];
+    if (root_node->lhs.subtype==subtype_)
+      out_.push_back(root_node->lhs);
+    if (root_node->rhs.subtype==subtype_)
+      out_.push_back(root_node->rhs);
+  }
+private:
+  scheduler::statement_node_subtype subtype_;
+  std::vector<scheduler::lhs_rhs_element> & out_;
+};
+
+/** @brief generate a string from an operation_node_type */
+inline const char * evaluate(scheduler::operation_node_type type)
+{
+  using namespace scheduler;
+  // unary expression
+  switch (type)
+  {
+  //Function
+  case OPERATION_UNARY_ABS_TYPE : return "abs";
+  case OPERATION_UNARY_ACOS_TYPE : return "acos";
+  case OPERATION_UNARY_ASIN_TYPE : return "asin";
+  case OPERATION_UNARY_ATAN_TYPE : return "atan";
+  case OPERATION_UNARY_CEIL_TYPE : return "ceil";
+  case OPERATION_UNARY_COS_TYPE : return "cos";
+  case OPERATION_UNARY_COSH_TYPE : return "cosh";
+  case OPERATION_UNARY_EXP_TYPE : return "exp";
+  case OPERATION_UNARY_FABS_TYPE : return "fabs";
+  case OPERATION_UNARY_FLOOR_TYPE : return "floor";
+  case OPERATION_UNARY_LOG_TYPE : return "log";
+  case OPERATION_UNARY_LOG10_TYPE : return "log10";
+  case OPERATION_UNARY_SIN_TYPE : return "sin";
+  case OPERATION_UNARY_SINH_TYPE : return "sinh";
+  case OPERATION_UNARY_SQRT_TYPE : return "sqrt";
+  case OPERATION_UNARY_TAN_TYPE : return "tan";
+  case OPERATION_UNARY_TANH_TYPE : return "tanh";
+
+  case OPERATION_UNARY_CAST_CHAR_TYPE : return "(char)";
+  case OPERATION_UNARY_CAST_UCHAR_TYPE : return "(uchar)";
+  case OPERATION_UNARY_CAST_SHORT_TYPE : return "(short)";
+  case OPERATION_UNARY_CAST_USHORT_TYPE : return "(ushort)";
+  case OPERATION_UNARY_CAST_INT_TYPE : return "(int)";
+  case OPERATION_UNARY_CAST_UINT_TYPE : return "(uint)";
+  case OPERATION_UNARY_CAST_LONG_TYPE : return "(long)";
+  case OPERATION_UNARY_CAST_ULONG_TYPE : return "(ulong)";
+  case OPERATION_UNARY_CAST_HALF_TYPE : return "(half)";
+  case OPERATION_UNARY_CAST_FLOAT_TYPE : return "(float)";
+  case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "(double)";
+
+  case OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "argfmax";
+  case OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "argmax";
+  case OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "argfmin";
+  case OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "argmin";
+  case OPERATION_BINARY_ELEMENT_POW_TYPE : return "pow";
+
+    //Arithmetic
+  case OPERATION_UNARY_MINUS_TYPE : return "-";
+  case OPERATION_BINARY_ASSIGN_TYPE : return "=";
+  case OPERATION_BINARY_INPLACE_ADD_TYPE : return "+=";
+  case OPERATION_BINARY_INPLACE_SUB_TYPE : return "-=";
+  case OPERATION_BINARY_ADD_TYPE : return "+";
+  case OPERATION_BINARY_SUB_TYPE : return "-";
+  case OPERATION_BINARY_MULT_TYPE : return "*";
+  case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "*";
+  case OPERATION_BINARY_DIV_TYPE : return "/";
+  case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "/";
+  case OPERATION_BINARY_ACCESS_TYPE : return "[]";
+
+    //Relational
+  case OPERATION_BINARY_ELEMENT_EQ_TYPE : return "isequal";
+  case OPERATION_BINARY_ELEMENT_NEQ_TYPE : return "isnotequal";
+  case OPERATION_BINARY_ELEMENT_GREATER_TYPE : return "isgreater";
+  case OPERATION_BINARY_ELEMENT_GEQ_TYPE : return "isgreaterequal";
+  case OPERATION_BINARY_ELEMENT_LESS_TYPE : return "isless";
+  case OPERATION_BINARY_ELEMENT_LEQ_TYPE : return "islessequal";
+
+  case OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "fmax";
+  case OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "fmin";
+  case OPERATION_BINARY_ELEMENT_MAX_TYPE : return "max";
+  case OPERATION_BINARY_ELEMENT_MIN_TYPE : return "min";
+    //Unary
+  case OPERATION_UNARY_TRANS_TYPE : return "trans";
+
+    //Binary
+  case OPERATION_BINARY_INNER_PROD_TYPE : return "iprod";
+  case OPERATION_BINARY_MAT_MAT_PROD_TYPE : return "mmprod";
+  case OPERATION_BINARY_MAT_VEC_PROD_TYPE : return "mvprod";
+  case OPERATION_BINARY_VECTOR_DIAG_TYPE : return "vdiag";
+  case OPERATION_BINARY_MATRIX_DIAG_TYPE : return "mdiag";
+  case OPERATION_BINARY_MATRIX_ROW_TYPE : return "row";
+  case OPERATION_BINARY_MATRIX_COLUMN_TYPE : return "col";
+
+  default : throw generator_not_supported_exception("Unsupported operator");
+  }
+}
+
+inline const char * operator_string(scheduler::operation_node_type type)
+{
+  using namespace scheduler;
+  switch (type)
+  {
+  case OPERATION_UNARY_CAST_CHAR_TYPE : return "char";
+  case OPERATION_UNARY_CAST_UCHAR_TYPE : return "uchar";
+  case OPERATION_UNARY_CAST_SHORT_TYPE : return "short";
+  case OPERATION_UNARY_CAST_USHORT_TYPE : return "ushort";
+  case OPERATION_UNARY_CAST_INT_TYPE : return "int";
+  case OPERATION_UNARY_CAST_UINT_TYPE : return "uint";
+  case OPERATION_UNARY_CAST_LONG_TYPE : return "long";
+  case OPERATION_UNARY_CAST_ULONG_TYPE : return "ulong";
+  case OPERATION_UNARY_CAST_HALF_TYPE : return "half";
+  case OPERATION_UNARY_CAST_FLOAT_TYPE : return "float";
+  case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "double";
+
+  case OPERATION_UNARY_MINUS_TYPE : return "umin";
+  case OPERATION_BINARY_ASSIGN_TYPE : return "assign";
+  case OPERATION_BINARY_INPLACE_ADD_TYPE : return "ip_add";
+  case OPERATION_BINARY_INPLACE_SUB_TYPE : return "ip_sub";
+  case OPERATION_BINARY_ADD_TYPE : return "add";
+  case OPERATION_BINARY_SUB_TYPE : return "sub";
+  case OPERATION_BINARY_MULT_TYPE : return "mult";
+  case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "eprod";
+  case OPERATION_BINARY_DIV_TYPE : return "div";
+  case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "ediv";
+  case OPERATION_BINARY_ACCESS_TYPE : return "acc";
+  default : return evaluate(type);
+  }
+}
+
+/** @brief functor for generating the expression string from a statement */
+class evaluate_expression_traversal: public tree_parsing::traversal_functor
+{
+private:
+  std::map<std::string, std::string> const & accessors_;
+  std::string & str_;
+  mapping_type const & mapping_;
+
+public:
+  evaluate_expression_traversal(std::map<std::string, std::string> const & accessors, std::string & str, mapping_type const & mapping) : accessors_(accessors), str_(str), mapping_(mapping){ }
+
+  void call_before_expansion(scheduler::statement const & statement, vcl_size_t root_idx) const
+  {
+    scheduler::statement_node const & root_node = statement.array()[root_idx];
+    if ((root_node.op.type_family==scheduler::OPERATION_UNARY_TYPE_FAMILY || utils::elementwise_function(root_node.op))
+        && !utils::node_leaf(root_node.op))
+      str_+=tree_parsing::evaluate(root_node.op.type);
+    str_+="(";
+
+  }
+
+  void call_after_expansion(scheduler::statement const & /*statement*/, vcl_size_t /*root_idx*/) const
+  {
+    str_+=")";
+  }
+
+  void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const
+  {
+    scheduler::statement_node const & root_node = statement.array()[root_idx];
+    mapping_type::key_type key = std::make_pair(root_idx, leaf);
+    if (leaf==PARENT_NODE_TYPE)
+    {
+      if (utils::node_leaf(root_node.op))
+        str_ += at(mapping_, key)->evaluate(accessors_);
+      else if (utils::elementwise_operator(root_node.op))
+        str_ += tree_parsing::evaluate(root_node.op.type);
+      else if (root_node.op.type_family!=scheduler::OPERATION_UNARY_TYPE_FAMILY && utils::elementwise_function(root_node.op))
+        str_ += ",";
+    }
+    else
+    {
+      if (leaf==LHS_NODE_TYPE)
+      {
+        if (root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+          str_ += at(mapping_, key)->evaluate(accessors_);
+      }
+
+      if (leaf==RHS_NODE_TYPE)
+      {
+        if (root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+          str_ += at(mapping_, key)->evaluate(accessors_);
+      }
+    }
+  }
+};
+
+inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors,
+                            scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping)
+{
+  std::string res;
+  evaluate_expression_traversal traversal_functor(accessors, res, mapping);
+  scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+  if (leaf==RHS_NODE_TYPE)
+  {
+    if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, false);
+    else
+      traversal_functor(statement, root_idx, leaf);
+  }
+  else if (leaf==LHS_NODE_TYPE)
+  {
+    if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, false);
+    else
+      traversal_functor(statement, root_idx, leaf);
+  }
+  else
+    tree_parsing::traverse(statement, root_idx, traversal_functor, false);
+
+  return res;
+}
+
+inline void evaluate(utils::kernel_generation_stream & stream, leaf_t leaf, std::map<std::string, std::string> const & accessors,
+                     statements_container const & statements, std::vector<mapping_type> const & mappings)
+{
+  statements_container::data_type::const_iterator sit;
+  std::vector<mapping_type>::const_iterator mit;
+
+  for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
+    stream << evaluate(leaf, accessors, *sit, sit->root(), *mit) << ";" << std::endl;
+}
+
+
+/** @brief functor for fetching or writing-back the elements in a statement */
+class process_traversal : public tree_parsing::traversal_functor
+{
+public:
+  process_traversal(std::string const & type_key, std::string const & to_process, utils::kernel_generation_stream & stream,
+                    mapping_type const & mapping, std::set<std::string> & already_processed) : type_key_(type_key), to_process_(to_process),  stream_(stream), mapping_(mapping), already_processed_(already_processed){ }
+
+  void operator()(scheduler::statement const & /*statement*/, vcl_size_t root_idx, leaf_t leaf) const
+  {
+    mapping_type::const_iterator it = mapping_.find(std::make_pair(root_idx, leaf));
+    if (it!=mapping_.end())
+    {
+      mapped_object * obj = it->second.get();
+      if (obj->type_key()==type_key_)
+      {
+        if (already_processed_.insert(obj->process("#name")).second)
+          stream_ << obj->process(to_process_) << std::endl;
+      }
+    }
+  }
+
+private:
+  std::string const & type_key_;
+  std::string const & to_process_;
+  utils::kernel_generation_stream & stream_;
+  mapping_type const & mapping_;
+  std::set<std::string> & already_processed_;
+};
+
+inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+                    scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed)
+{
+  process_traversal traversal_functor(type_key, to_process, stream, mapping, already_processed);
+  scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+  if (leaf==RHS_NODE_TYPE)
+  {
+    if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, true);
+    else
+      traversal_functor(statement, root_idx, leaf);
+  }
+  else if (leaf==LHS_NODE_TYPE)
+  {
+    if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+      tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, true);
+    else
+      traversal_functor(statement, root_idx, leaf);
+  }
+  else
+  {
+    tree_parsing::traverse(statement, root_idx, traversal_functor, true);
+  }
+}
+
+inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+                    statements_container const & statements, std::vector<mapping_type> const & mappings)
+{
+  statements_container::data_type::const_iterator sit;
+  std::vector<mapping_type>::const_iterator mit;
+  std::set<std::string> already_processed;
+
+  for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
+    process(stream, leaf, type_key, to_process, *sit, sit->root(), *mit, already_processed);
+}
+
+
+class statement_representation_functor : public traversal_functor{
+private:
+  static void append_id(char * & ptr, unsigned int val)
+  {
+    if (val==0)
+      *ptr++='0';
+    else
+      while (val>0)
+      {
+        *ptr++= (char)('0' + (val % 10));
+        val /= 10;
+      }
+  }
+
+public:
+  typedef void result_type;
+
+  statement_representation_functor(symbolic_binder & binder, char *& ptr) : binder_(binder), ptr_(ptr){ }
+
+  template<class NumericT>
+  inline result_type operator()(NumericT const & /*scal*/) const
+  {
+    *ptr_++='h'; //host
+    *ptr_++='s'; //scalar
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+  }
+
+  /** @brief Scalar mapping */
+  template<class NumericT>
+  inline result_type operator()(scalar<NumericT> const & scal) const
+  {
+    *ptr_++='s'; //scalar
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+    append_id(ptr_, binder_.get(&traits::handle(scal)));
+  }
+
+  /** @brief Vector mapping */
+  template<class NumericT>
+  inline result_type operator()(vector_base<NumericT> const & vec) const
+  {
+    *ptr_++='v'; //vector
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+    append_id(ptr_, binder_.get(&traits::handle(vec)));
+  }
+
+  /** @brief Implicit vector mapping */
+  template<class NumericT>
+  inline result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) const
+  {
+    *ptr_++='i'; //implicit
+    *ptr_++='v'; //vector
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+  }
+
+  /** @brief Matrix mapping */
+  template<class NumericT>
+  inline result_type operator()(matrix_base<NumericT> const & mat) const
+  {
+    *ptr_++='m'; //Matrix
+    *ptr_++=mat.row_major()?'r':'c';
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+    append_id(ptr_, binder_.get(&traits::handle(mat)));
+  }
+
+  /** @brief Implicit matrix mapping */
+  template<class NumericT>
+  inline result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) const
+  {
+    *ptr_++='i'; //implicit
+    *ptr_++='m'; //matrix
+    *ptr_++=utils::first_letter_of_type<NumericT>::value();
+  }
+
+  static inline void append(char*& p, const char * str)
+  {
+    vcl_size_t n = std::strlen(str);
+    std::memcpy(p, str, n);
+    p+=n;
+  }
+
+  inline void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const
+  {
+    scheduler::statement_node const & root_node = statement.array()[root_idx];
+    if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+      utils::call_on_element(root_node.lhs, *this);
+    else if (root_node.op.type_family==scheduler::OPERATION_BINARY_TYPE_FAMILY && leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+      utils::call_on_element(root_node.rhs, *this);
+    else if (leaf_t==PARENT_NODE_TYPE)
+      append_id(ptr_,root_node.op.type);
+  }
+
+private:
+  symbolic_binder & binder_;
+  char *& ptr_;
+};
+
+inline std::string statements_representation(statements_container const & statements, binding_policy_t binding_policy)
+{
+  std::vector<char> program_name_vector(256);
+  char* program_name = &(program_name_vector[0]);
+  if (statements.order()==statements_container::INDEPENDENT)
+    *program_name++='i';
+  else
+    *program_name++='s';
+  tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy);
+  for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
+    tree_parsing::traverse(*it, it->root(), tree_parsing::statement_representation_functor(*binder, program_name),true);
+  *program_name='\0';
+  return std::string(&(program_name_vector[0]));
+}
+
+}
+}
+}
+#endif
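
As an illustration of the traversal interface above: any functor deriving from traversal_functor that provides operator()(statement, root_idx, leaf_t) can be handed to traverse(). The node counter below is a sketch, not part of the header.

#include "viennacl/device_specific/tree_parsing.hpp"

// Counts the operation nodes of a scheduler::statement by visiting its expression tree.
class node_counter : public viennacl::device_specific::tree_parsing::traversal_functor
{
public:
  node_counter(viennacl::vcl_size_t & count) : count_(count) {}

  void operator()(viennacl::scheduler::statement const & /*statement*/,
                  viennacl::vcl_size_t /*root_idx*/,
                  viennacl::device_specific::leaf_t leaf) const
  {
    if (leaf == viennacl::device_specific::PARENT_NODE_TYPE)   // each node is reported once as its own parent
      ++count_;
  }

private:
  viennacl::vcl_size_t & count_;
};

// Usage, given a viennacl::scheduler::statement s:
//   viennacl::vcl_size_t n = 0;
//   viennacl::device_specific::tree_parsing::traverse(s, s.root(), node_counter(n), true);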

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
new file mode 100644
index 0000000..1f1fc60
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
@@ -0,0 +1,568 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_UTILS_HPP
+#define VIENNACL_DEVICE_SPECIFIC_UTILS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/utils.hpp
+    @brief Internal utils
+*/
+
+#include <sstream>
+
+#include "viennacl/detail/matrix_def.hpp"
+#include "viennacl/detail/vector_def.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/ocl/forwards.h"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/row_major.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace utils
+{
+
+//CUDA Conversion
+inline std::string opencl_source_to_cuda_source(std::string const & opencl_src)
+{
+  std::string res = opencl_src;
+
+  viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
+
+  //Pointer
+  viennacl::tools::find_and_replace(res, "__global float*", "float*");
+  viennacl::tools::find_and_replace(res, "__local float*", "float*");
+
+  viennacl::tools::find_and_replace(res, "__global double*", "double*");
+  viennacl::tools::find_and_replace(res, "__local double*", "double*");
+
+  //Qualifiers
+  viennacl::tools::find_and_replace(res,"__global","__device__");
+  viennacl::tools::find_and_replace(res,"__kernel","__global__");
+  viennacl::tools::find_and_replace(res,"__constant","__constant__");
+  viennacl::tools::find_and_replace(res,"__local","__shared__");
+
+  //Indexing
+  viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
+  viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
+
+  viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
+  viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
+
+  viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
+  viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
+
+  viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
+  viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
+
+  viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
+  viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
+
+  //Synchronization
+  viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
+  viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
+
+
+  return res;
+}
+
+static std::string numeric_type_to_string(scheduler::statement_node_numeric_type const & type){
+  switch (type)
+  {
+  //case scheduler::CHAR_TYPE: return "char";
+  //case scheduler::UCHAR_TYPE: return "unsigned char";
+  //case scheduler::SHORT_TYPE: return "short";
+  //case scheduler::USHORT_TYPE: return "unsigned short";
+  case scheduler::INT_TYPE:  return "int";
+  case scheduler::UINT_TYPE: return "unsigned int";
+  case scheduler::LONG_TYPE:  return "long";
+  case scheduler::ULONG_TYPE: return "unsigned long";
+  case scheduler::FLOAT_TYPE : return "float";
+  case scheduler::DOUBLE_TYPE : return "double";
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+
+template<class Fun>
+static typename Fun::result_type call_on_host_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a host scalar"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(element.host_char);
+  //case scheduler::UCHAR_TYPE: return fun(element.host_uchar);
+  //case scheduler::SHORT_TYPE: return fun(element.host_short);
+  //case scheduler::USHORT_TYPE: return fun(element.host_ushort);
+  case scheduler::INT_TYPE:  return fun(element.host_int);
+  case scheduler::UINT_TYPE: return fun(element.host_uint);
+  case scheduler::LONG_TYPE:  return fun(element.host_long);
+  case scheduler::ULONG_TYPE: return fun(element.host_ulong);
+  case scheduler::FLOAT_TYPE : return fun(element.host_float);
+  case scheduler::DOUBLE_TYPE : return fun(element.host_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a scalar"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(*element.scalar_char);
+  //case scheduler::UCHAR_TYPE: return fun(*element.scalar_uchar);
+  //case scheduler::SHORT_TYPE: return fun(*element.scalar_short);
+  //case scheduler::USHORT_TYPE: return fun(*element.scalar_ushort);
+  case scheduler::INT_TYPE:  return fun(*element.scalar_int);
+  case scheduler::UINT_TYPE: return fun(*element.scalar_uint);
+  case scheduler::LONG_TYPE:  return fun(*element.scalar_long);
+  case scheduler::ULONG_TYPE: return fun(*element.scalar_ulong);
+  case scheduler::FLOAT_TYPE : return fun(*element.scalar_float);
+  case scheduler::DOUBLE_TYPE : return fun(*element.scalar_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a vector"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(*element.vector_char);
+  //case scheduler::UCHAR_TYPE: return fun(*element.vector_uchar);
+  //case scheduler::SHORT_TYPE: return fun(*element.vector_short);
+  //case scheduler::USHORT_TYPE: return fun(*element.vector_ushort);
+  case scheduler::INT_TYPE:  return fun(*element.vector_int);
+  case scheduler::UINT_TYPE: return fun(*element.vector_uint);
+  case scheduler::LONG_TYPE:  return fun(*element.vector_long);
+  case scheduler::ULONG_TYPE: return fun(*element.vector_ulong);
+  case scheduler::FLOAT_TYPE : return fun(*element.vector_float);
+  case scheduler::DOUBLE_TYPE : return fun(*element.vector_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_implicit_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY   && bool("Must be called on an implicit_vector"));
+  assert(element.subtype     == scheduler::IMPLICIT_VECTOR_TYPE && bool("Must be called on an implicit_vector"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(*element.implicit_vector_char);
+  //case scheduler::UCHAR_TYPE: return fun(*element.implicit_vector_uchar);
+  //case scheduler::SHORT_TYPE: return fun(*element.implicit_vector_short);
+  //case scheduler::USHORT_TYPE: return fun(*element.implicit_vector_ushort);
+  case scheduler::INT_TYPE:  return fun(*element.implicit_vector_int);
+  case scheduler::UINT_TYPE: return fun(*element.implicit_vector_uint);
+  case scheduler::LONG_TYPE:  return fun(*element.implicit_vector_long);
+  case scheduler::ULONG_TYPE: return fun(*element.implicit_vector_ulong);
+  case scheduler::FLOAT_TYPE : return fun(*element.implicit_vector_float);
+  case scheduler::DOUBLE_TYPE : return fun(*element.implicit_vector_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::MATRIX_TYPE_FAMILY && bool("Must be called on a matrix"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(*element.matrix_char);
+  //case scheduler::UCHAR_TYPE: return fun(*element.matrix_uchar);
+  //case scheduler::SHORT_TYPE: return fun(*element.matrix_short);
+  //case scheduler::USHORT_TYPE: return fun(*element.matrix_ushort);
+  case scheduler::INT_TYPE:  return fun(*element.matrix_int);
+  case scheduler::UINT_TYPE: return fun(*element.matrix_uint);
+  case scheduler::LONG_TYPE:  return fun(*element.matrix_long);
+  case scheduler::ULONG_TYPE: return fun(*element.matrix_ulong);
+  case scheduler::FLOAT_TYPE : return fun(*element.matrix_float);
+  case scheduler::DOUBLE_TYPE : return fun(*element.matrix_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+
+template<class Fun>
+static typename Fun::result_type call_on_implicit_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.subtype     == scheduler::IMPLICIT_MATRIX_TYPE && bool("Must be called on an implicit matrix"));
+  switch (element.numeric_type)
+  {
+  //case scheduler::CHAR_TYPE: return fun(*element.implicit_matrix_char);
+  //case scheduler::UCHAR_TYPE: return fun(*element.implicit_matrix_uchar);
+  //case scheduler::SHORT_TYPE: return fun(*element.implicit_matrix_short);
+  //case scheduler::USHORT_TYPE: return fun(*element.implicit_matrix_ushort);
+  case scheduler::INT_TYPE:  return fun(*element.implicit_matrix_int);
+  case scheduler::UINT_TYPE: return fun(*element.implicit_matrix_uint);
+  case scheduler::LONG_TYPE:  return fun(*element.implicit_matrix_long);
+  case scheduler::ULONG_TYPE: return fun(*element.implicit_matrix_ulong);
+  case scheduler::FLOAT_TYPE : return fun(*element.implicit_matrix_float);
+  case scheduler::DOUBLE_TYPE : return fun(*element.implicit_matrix_double);
+  default : throw generator_not_supported_exception("Unsupported Scalartype");
+  }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_element(scheduler::lhs_rhs_element const & element, Fun const & fun){
+  switch (element.type_family)
+  {
+  case scheduler::SCALAR_TYPE_FAMILY:
+    if (element.subtype == scheduler::HOST_SCALAR_TYPE)
+      return call_on_host_scalar(element, fun);
+    else
+      return call_on_scalar(element, fun);
+  case scheduler::VECTOR_TYPE_FAMILY :
+    if (element.subtype == scheduler::IMPLICIT_VECTOR_TYPE)
+      return call_on_implicit_vector(element, fun);
+    else
+      return call_on_vector(element, fun);
+  case scheduler::MATRIX_TYPE_FAMILY:
+    if (element.subtype == scheduler::IMPLICIT_MATRIX_TYPE)
+      return call_on_implicit_matrix(element, fun);
+    else
+      return call_on_matrix(element,fun);
+  default:
+    throw generator_not_supported_exception("Unsupported datastructure type : Not among {Scalar, Vector, Matrix}");
+  }
+}
+
+struct scalartype_size_fun
+{
+  typedef vcl_size_t result_type;
+  result_type operator()(float const &) const { return sizeof(float); }
+  result_type operator()(double const &) const { return sizeof(double); }
+  template<class T> result_type operator()(T const &) const { return sizeof(typename viennacl::result_of::cpu_value_type<T>::type); }
+};
+
+struct internal_size_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::internal_size(t); }
+};
+
+struct size_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::size(t); }
+};
+
+struct stride_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride(t); }
+};
+
+struct start1_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::start1(t); }
+};
+
+struct start2_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::start2(t); }
+};
+
+struct leading_stride
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::stride2(t):viennacl::traits::stride1(t); }
+};
+
+struct leading_start
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::start2(t):viennacl::traits::start1(t); }
+};
+
+struct stride1_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride1(t); }
+};
+
+struct stride2_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride2(t); }
+};
+
+struct handle_fun
+{
+  typedef cl_mem result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::opencl_handle(t); }
+};
+
+struct internal_size1_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::internal_size1(t); }
+};
+
+struct row_major_fun
+{
+  typedef bool result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::row_major(t); }
+};
+
+struct internal_size2_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::internal_size2(t); }
+};
+
+struct size1_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::size1(t); }
+};
+
+struct size2_fun
+{
+  typedef vcl_size_t result_type;
+  template<class T>
+  result_type operator()(T const &t) const { return viennacl::traits::size2(t); }
+};
+
+template<class T, class U>
+struct is_same_type { enum { value = 0 }; };
+
+template<class T>
+struct is_same_type<T,T> { enum { value = 1 }; };
+
+inline bool is_reduction(scheduler::statement_node const & node)
+{
+  return node.op.type_family==scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
+      || node.op.type_family==scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY
+      || node.op.type_family==scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+      || node.op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE
+      || node.op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE;
+}
+
+inline bool is_index_reduction(scheduler::op_element const & op)
+{
+  return op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE
+      || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE
+      || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE
+      || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE;
+}
+template<class T>
+struct type_to_string;
+template<> struct type_to_string<unsigned char> { static const char * value() { return "uchar"; } };
+template<> struct type_to_string<char> { static const char * value() { return "char"; } };
+template<> struct type_to_string<unsigned short> { static const char * value() { return "ushort"; } };
+template<> struct type_to_string<short> { static const char * value() { return "short"; } };
+template<> struct type_to_string<unsigned int> { static const char * value() { return "uint"; } };
+template<> struct type_to_string<int> { static const char * value() { return "int"; } };
+template<> struct type_to_string<unsigned long> { static const char * value() { return "ulong"; } };
+template<> struct type_to_string<long> { static const char * value() { return "long"; } };
+template<> struct type_to_string<float> { static const char * value() { return "float"; } };
+template<> struct type_to_string<double> { static const char * value() { return "double"; } };
+
+
+template<class T>
+struct first_letter_of_type;
+template<> struct first_letter_of_type<char> { static char value() { return 'c'; } };
+template<> struct first_letter_of_type<unsigned char> { static char value() { return 'd'; } };
+template<> struct first_letter_of_type<short> { static char value() { return 's'; } };
+template<> struct first_letter_of_type<unsigned short> { static char value() { return 't'; } };
+template<> struct first_letter_of_type<int> { static char value() { return 'i'; } };
+template<> struct first_letter_of_type<unsigned int> { static char value() { return 'j'; } };
+template<> struct first_letter_of_type<long> { static char value() { return 'l'; } };
+template<> struct first_letter_of_type<unsigned long> { static char value() { return 'm'; } };
+template<> struct first_letter_of_type<float> { static char value() { return 'f'; } };
+template<> struct first_letter_of_type<double> { static char value() { return 'd'; } };
+
+class kernel_generation_stream : public std::ostream
+{
+  class kgenstream : public std::stringbuf
+  {
+  public:
+    kgenstream(std::ostringstream& osstream,unsigned int const & tab_count) : oss_(osstream), tab_count_(tab_count){ }
+    int sync() {
+      for (unsigned int i=0; i<tab_count_;++i)
+        oss_ << "    ";
+      oss_ << str();
+      str("");
+      return !oss_;
+    }
+#if defined(_MSC_VER)
+    ~kgenstream() throw() {  pubsync(); }
+#else
+    ~kgenstream() {  pubsync(); }
+#endif
+  private:
+    std::ostream& oss_;
+    unsigned int const & tab_count_;
+  };
+
+public:
+  kernel_generation_stream() : std::ostream(new kgenstream(oss,tab_count_)), tab_count_(0){ }
+#if defined(_MSC_VER)
+  ~kernel_generation_stream() throw() { delete rdbuf(); }
+#else
+  ~kernel_generation_stream(){ delete rdbuf(); }
+#endif
+
+  std::string str(){ return oss.str(); }
+  void inc_tab(){ ++tab_count_; }
+  void dec_tab(){ --tab_count_; }
+private:
+  unsigned int tab_count_;
+  std::ostringstream oss;
+};
+
+inline bool node_leaf(scheduler::op_element const & op)
+{
+  using namespace scheduler;
+  return op.type==OPERATION_UNARY_NORM_1_TYPE
+      || op.type==OPERATION_UNARY_NORM_2_TYPE
+      || op.type==OPERATION_UNARY_NORM_INF_TYPE
+      || op.type==OPERATION_UNARY_TRANS_TYPE
+      || op.type==OPERATION_BINARY_MAT_VEC_PROD_TYPE
+      || op.type==OPERATION_BINARY_MAT_MAT_PROD_TYPE
+      || op.type==OPERATION_BINARY_INNER_PROD_TYPE
+      || op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE
+      || op.type==OPERATION_BINARY_VECTOR_DIAG_TYPE
+      || op.type==OPERATION_BINARY_MATRIX_ROW_TYPE
+      || op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE
+      || op.type_family==OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
+      || op.type_family==OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+      || op.type_family==OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
+}
+
+inline bool elementwise_operator(scheduler::op_element const & op)
+{
+  using namespace scheduler;
+  return op.type== OPERATION_BINARY_ASSIGN_TYPE
+      || op.type== OPERATION_BINARY_INPLACE_ADD_TYPE
+      || op.type== OPERATION_BINARY_INPLACE_SUB_TYPE
+      || op.type== OPERATION_BINARY_ADD_TYPE
+      || op.type== OPERATION_BINARY_SUB_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_PROD_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_DIV_TYPE
+      || op.type== OPERATION_BINARY_MULT_TYPE
+      || op.type== OPERATION_BINARY_DIV_TYPE;
+}
+
+inline bool elementwise_function(scheduler::op_element const & op)
+{
+  using namespace scheduler;
+  return
+
+      op.type == OPERATION_UNARY_CAST_CHAR_TYPE
+      || op.type == OPERATION_UNARY_CAST_UCHAR_TYPE
+      || op.type == OPERATION_UNARY_CAST_SHORT_TYPE
+      || op.type == OPERATION_UNARY_CAST_USHORT_TYPE
+      || op.type == OPERATION_UNARY_CAST_INT_TYPE
+      || op.type == OPERATION_UNARY_CAST_UINT_TYPE
+      || op.type == OPERATION_UNARY_CAST_LONG_TYPE
+      || op.type == OPERATION_UNARY_CAST_ULONG_TYPE
+      || op.type == OPERATION_UNARY_CAST_HALF_TYPE
+      || op.type == OPERATION_UNARY_CAST_FLOAT_TYPE
+      || op.type == OPERATION_UNARY_CAST_DOUBLE_TYPE
+
+      || op.type== OPERATION_UNARY_ABS_TYPE
+      || op.type== OPERATION_UNARY_ACOS_TYPE
+      || op.type== OPERATION_UNARY_ASIN_TYPE
+      || op.type== OPERATION_UNARY_ATAN_TYPE
+      || op.type== OPERATION_UNARY_CEIL_TYPE
+      || op.type== OPERATION_UNARY_COS_TYPE
+      || op.type== OPERATION_UNARY_COSH_TYPE
+      || op.type== OPERATION_UNARY_EXP_TYPE
+      || op.type== OPERATION_UNARY_FABS_TYPE
+      || op.type== OPERATION_UNARY_FLOOR_TYPE
+      || op.type== OPERATION_UNARY_LOG_TYPE
+      || op.type== OPERATION_UNARY_LOG10_TYPE
+      || op.type== OPERATION_UNARY_SIN_TYPE
+      || op.type== OPERATION_UNARY_SINH_TYPE
+      || op.type== OPERATION_UNARY_SQRT_TYPE
+      || op.type== OPERATION_UNARY_TAN_TYPE
+      || op.type== OPERATION_UNARY_TANH_TYPE
+
+      || op.type== OPERATION_BINARY_ELEMENT_POW_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_EQ_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_NEQ_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_GREATER_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_LESS_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_GEQ_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_LEQ_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_FMAX_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_FMIN_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_MAX_TYPE
+      || op.type== OPERATION_BINARY_ELEMENT_MIN_TYPE;
+
+}
+
+inline scheduler::lhs_rhs_element & lhs_rhs_element(scheduler::statement const & st, vcl_size_t idx, leaf_t leaf)
+{
+  using namespace tree_parsing;
+  assert(leaf==LHS_NODE_TYPE || leaf==RHS_NODE_TYPE);
+  if (leaf==LHS_NODE_TYPE)
+    return const_cast<scheduler::lhs_rhs_element &>(st.array()[idx].lhs);
+  return const_cast<scheduler::lhs_rhs_element &>(st.array()[idx].rhs);
+}
+
+inline unsigned int size_of(scheduler::statement_node_numeric_type type)
+{
+  using namespace scheduler;
+  switch (type)
+  {
+  case UCHAR_TYPE:
+  case CHAR_TYPE: return 1;
+
+  case USHORT_TYPE:
+  case SHORT_TYPE:
+  case HALF_TYPE: return 2;
+
+  case UINT_TYPE:
+  case INT_TYPE:
+  case FLOAT_TYPE: return 4;
+
+  case ULONG_TYPE:
+  case LONG_TYPE:
+  case DOUBLE_TYPE: return 8;
+
+  default: throw generator_not_supported_exception("Unsupported scalartype");
+  }
+}
+
+inline std::string append_width(std::string const & str, unsigned int width)
+{
+  if (width==1)
+    return str;
+  return str + tools::to_string(width);
+}
+
+}
+}
+}
+#endif

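For orientation, the helpers closing the header above are small code-generation utilities: size_of() maps a scheduler numeric type to its byte width, and append_width() builds an OpenCL vector type name by appending the vector width to the scalar type name. The following standalone sketch only mirrors that convention; the name append_width_sketch is illustrative and not a ViennaCL symbol.

    #include <cassert>
    #include <sstream>
    #include <string>

    // Mirrors the convention of append_width() above: width 1 keeps the scalar
    // type name, larger widths yield the OpenCL vector type name.
    static std::string append_width_sketch(std::string const & str, unsigned int width)
    {
      if (width == 1)
        return str;
      std::ostringstream os;
      os << str << width;        // "float" + 4 -> "float4"
      return os.str();
    }

    int main()
    {
      assert(append_width_sketch("float", 1)  == "float");
      assert(append_width_sketch("double", 4) == "double4");
      return 0;
    }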
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
new file mode 100644
index 0000000..3c3a428
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
@@ -0,0 +1,362 @@
+#ifndef VIENNACL_ELL_MATRIX_HPP_
+#define VIENNACL_ELL_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ell_matrix.hpp
+    @brief Implementation of the ell_matrix class
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief Sparse matrix class using the ELLPACK format for storing the nonzeros.
+    *
+    * This format works best for matrices where the number of nonzeros per row is mostly the same.
+    * Finite element and finite difference methods on nicely shaped domains often result in such a nonzero pattern.
+    * For a matrix
+    *
+    *   (1 2 0 0 0)
+    *   (2 3 4 0 0)
+    *   (0 5 6 0 7)
+    *   (0 0 8 9 0)
+    *
+    * the entries are laid out in three chunks of length 4 (one chunk per nonzero slot, padded with zeros) as
+    *   (1 2 5 8; 2 3 6 9; 0 4 7 0)
+    * Note that this is a 'transposed' representation in order to maximize coalesced memory access.
+    */
+template<typename NumericT, unsigned int AlignmentV /* see forwards.h for default argument */>
+class ell_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+  typedef vcl_size_t                                                                                 size_type;
+
+  ell_matrix() : rows_(0), cols_(0), maxnnz_(0) {}
+
+  ell_matrix(viennacl::context ctx) : rows_(0), cols_(0), maxnnz_(0)
+  {
+    coords_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      coords_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+  void clear()
+  {
+    maxnnz_ = 0;
+
+    viennacl::backend::typesafe_host_array<unsigned int> host_coords_buffer(coords_, internal_size1());
+    std::vector<NumericT> host_elements(internal_size1());
+
+    viennacl::backend::memory_create(coords_,   host_coords_buffer.element_size() * internal_size1(), viennacl::traits::context(coords_),   host_coords_buffer.get());
+    viennacl::backend::memory_create(elements_, sizeof(NumericT) * internal_size1(),                  viennacl::traits::context(elements_), &(host_elements[0]));
+  }
+
+  vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, AlignmentV); }
+  vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, AlignmentV); }
+
+  vcl_size_t size1() const { return rows_; }
+  vcl_size_t size2() const { return cols_; }
+
+  vcl_size_t internal_maxnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(maxnnz_, AlignmentV); }
+  vcl_size_t maxnnz() const { return maxnnz_; }
+
+  vcl_size_t nnz() const { return rows_ * maxnnz_; }
+  vcl_size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
+
+  handle_type & handle()       { return elements_; }
+  const handle_type & handle() const { return elements_; }
+
+  handle_type & handle2()       { return coords_; }
+  const handle_type & handle2() const { return coords_; }
+
+#if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+  template<typename CPUMatrixT>
+  friend void copy(const CPUMatrixT & cpu_matrix, ell_matrix & gpu_matrix );
+#else
+  template<typename CPUMatrixT, typename T, unsigned int ALIGN>
+  friend void copy(const CPUMatrixT & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
+#endif
+
+private:
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t maxnnz_;
+
+  handle_type coords_;
+  handle_type elements_;
+};
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT& cpu_matrix, ell_matrix<NumericT, AlignmentV>& gpu_matrix )
+{
+  assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if (cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+  {
+    //determine max capacity for row
+    vcl_size_t max_entries_per_row = 0;
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+    {
+      vcl_size_t num_entries = 0;
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+        ++num_entries;
+
+      max_entries_per_row = std::max(max_entries_per_row, num_entries);
+    }
+
+    //setup GPU matrix
+    gpu_matrix.maxnnz_ = max_entries_per_row;
+    gpu_matrix.rows_ = cpu_matrix.size1();
+    gpu_matrix.cols_ = cpu_matrix.size2();
+
+    vcl_size_t nnz = gpu_matrix.internal_nnz();
+
+    viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), nnz);
+    std::vector<NumericT> elements(nnz, 0);
+
+    // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " "
+    //             << gpu_matrix.internal_maxnnz() << "\n";
+
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+    {
+      vcl_size_t data_index = 0;
+
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      {
+        coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+        elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+        //std::cout << *col_it << "\n";
+        data_index++;
+      }
+    }
+
+    viennacl::backend::memory_create(gpu_matrix.handle2(), coords.raw_size(),                   traits::context(gpu_matrix.handle2()), coords.get());
+    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(NumericT) * elements.size(), traits::context(gpu_matrix.handle()), &(elements[0]));
+  }
+}
+
+
+
+/** @brief Copies a sparse matrix from the host to the compute device. The host type is the std::vector< std::map<> > format.
+  *
+  * @param cpu_matrix   A sparse matrix on the host composed of an STL vector and an STL map.
+  * @param gpu_matrix   The sparse ell_matrix from ViennaCL
+  */
+template<typename IndexT, typename NumericT, unsigned int AlignmentV>
+void copy(std::vector< std::map<IndexT, NumericT> > const & cpu_matrix,
+          ell_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+  vcl_size_t max_col = 0;
+  for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+  {
+    if (cpu_matrix[i].size() > 0)
+      max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+  }
+
+  viennacl::copy(tools::const_sparse_matrix_adapter<NumericT, IndexT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
+
+
+
+
+
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const ell_matrix<NumericT, AlignmentV>& gpu_matrix, CPUMatrixT& cpu_matrix)
+{
+  assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if (gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+  {
+    std::vector<NumericT> elements(gpu_matrix.internal_nnz());
+    viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), gpu_matrix.internal_nnz());
+
+    viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT) * elements.size(), &(elements[0]));
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, coords.raw_size(), coords.get());
+
+    for (vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+    {
+      for (vcl_size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
+      {
+        vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+        NumericT val = elements[offset];
+        if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+          continue;
+
+        if (coords[offset] >= gpu_matrix.size2())
+        {
+          std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << coords[offset] << " " << gpu_matrix.size2() << std::endl;
+          return;
+        }
+
+        cpu_matrix(row, coords[offset]) = val;
+      }
+    }
+  }
+}
+
+
+/** @brief Copies a sparse matrix from the compute device to the host. The host type is the std::vector< std::map<> > format.
+  *
+  * @param gpu_matrix   The sparse ell_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host composed of an STL vector and an STL map.
+  */
+template<typename NumericT, unsigned int AlignmentV, typename IndexT>
+void copy(const ell_matrix<NumericT, AlignmentV> & gpu_matrix,
+          std::vector< std::map<IndexT, NumericT> > & cpu_matrix)
+{
+  if (cpu_matrix.size() == 0)
+    cpu_matrix.resize(gpu_matrix.size1());
+
+  assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+  tools::sparse_matrix_adapter<NumericT, IndexT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+  viennacl::copy(gpu_matrix, temp);
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x += A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x -= A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
+
+

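As a usage illustration of the ELL layout documented above, this sketch builds the 4x5 example matrix from the class documentation on the host and copies it into an ell_matrix via the std::vector< std::map<> > overload shown in the diff. It assumes a configured ViennaCL backend and omits error handling.

    #include <map>
    #include <vector>

    #include "viennacl/ell_matrix.hpp"

    int main()
    {
      // The 4x5 matrix from the ell_matrix documentation; its rows have 2, 3, 3
      // and 2 nonzeros, so the ELL storage keeps maxnnz = 3 entries per row.
      std::vector< std::map<unsigned int, float> > host_A(4);
      host_A[0][0] = 1; host_A[0][1] = 2;
      host_A[1][0] = 2; host_A[1][1] = 3; host_A[1][2] = 4;
      host_A[2][1] = 5; host_A[2][2] = 6; host_A[2][4] = 7;
      host_A[3][2] = 8; host_A[3][3] = 9;

      viennacl::ell_matrix<float> A;
      viennacl::copy(host_A, A);   // std::vector< std::map<> > overload above

      // nnz() counts the padded ELL slots (rows * maxnnz = 12), not the 9
      // structural nonzeros of the original matrix.
      return (A.maxnnz() == 3 && A.nnz() == 12) ? 0 : 1;
    }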
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/fft.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/fft.hpp b/native-viennaCL/src/main/cpp/viennacl/fft.hpp
new file mode 100644
index 0000000..bacd911
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/fft.hpp
@@ -0,0 +1,282 @@
+#ifndef VIENNACL_FFT_HPP
+#define VIENNACL_FFT_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+   Institute for Analysis and Scientific Computing,
+   TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/fft.hpp
+ @brief All routines related to the Fast Fourier Transform. Experimental.
+ */
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/fft_operations.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#include <cmath>
+
+#include <stdexcept>
+/// @cond
+namespace viennacl
+{
+namespace detail
+{
+namespace fft
+{
+  inline bool is_radix2(vcl_size_t data_size)
+  {
+    return !((data_size > 2) && (data_size & (data_size - 1)));
+  }
+} //namespace fft
+} //namespace detail
+
+/**
+ * @brief Generic inplace version of 1-D Fourier transformation.
+ *
+ * @param input       Input vector, result will be stored here.
+ * @param batch_num   Number of items in batch
+ * @param sign        Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_fft(viennacl::vector<NumericT, AlignmentV>& input, vcl_size_t batch_num = 1,
+                 NumericT sign = -1.0)
+{
+  vcl_size_t size = (input.size() >> 1) / batch_num;
+
+  if (!viennacl::detail::fft::is_radix2(size))
+  {
+    viennacl::vector<NumericT, AlignmentV> output(input.size());
+    viennacl::linalg::direct(input, output, size, size, batch_num, sign);
+    viennacl::copy(output, input);
+  }
+  else
+    viennacl::linalg::radix2(input, size, size, batch_num, sign);
+}
+
+/**
+ * @brief Generic version of 1-D Fourier transformation.
+ *
+ * @param input      Input vector.
+ * @param output     Output vector.
+ * @param batch_num  Number of items in batch.
+ * @param sign       Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void fft(viennacl::vector<NumericT, AlignmentV>& input,
+         viennacl::vector<NumericT, AlignmentV>& output, vcl_size_t batch_num = 1, NumericT sign = -1.0)
+{
+  vcl_size_t size = (input.size() >> 1) / batch_num;
+  if (viennacl::detail::fft::is_radix2(size))
+  {
+    viennacl::copy(input, output);
+    viennacl::linalg::radix2(output, size, size, batch_num, sign);
+  }
+  else
+    viennacl::linalg::direct(input, output, size, size, batch_num, sign);
+}
+
+/**
+ * @brief Generic inplace version of 2-D Fourier transformation.
+ *
+ * @param input       Input matrix, result will be stored here.
+ * @param sign        Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_fft(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& input,
+                 NumericT sign = -1.0)
+{
+  vcl_size_t rows_num = input.size1();
+  vcl_size_t cols_num = input.size2() >> 1;
+
+  vcl_size_t cols_int = input.internal_size2() >> 1;
+
+  // batch with rows
+  if (viennacl::detail::fft::is_radix2(cols_num))
+    viennacl::linalg::radix2(input, cols_num, cols_int, rows_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+  else
+  {
+    viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> output(input.size1(),
+                                                                       input.size2());
+
+    viennacl::linalg::direct(input, output, cols_num, cols_int, rows_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+
+    input = output;
+  }
+
+  // batch with cols
+  if (viennacl::detail::fft::is_radix2(rows_num))
+    viennacl::linalg::radix2(input, rows_num, cols_int, cols_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+  else
+  {
+    viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> output(input.size1(),
+                                                                       input.size2());
+
+    viennacl::linalg::direct(input, output, rows_num, cols_int, cols_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+
+    input = output;
+  }
+
+}
+
+/**
+ * @brief Generic version of 2-D Fourier transformation.
+ *
+ * @param input      Input matrix.
+ * @param output     Output matrix.
+ * @param sign       Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void fft(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& input, //TODO
+         viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& output, NumericT sign = -1.0)
+{
+
+  vcl_size_t rows_num = input.size1();
+  vcl_size_t cols_num = input.size2() >> 1;
+  vcl_size_t cols_int = input.internal_size2() >> 1;
+
+  // batch with rows
+  if (viennacl::detail::fft::is_radix2(cols_num))
+  {
+    output = input;
+    viennacl::linalg::radix2(output, cols_num, cols_int, rows_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+  }
+  else
+    viennacl::linalg::direct(input, output, cols_num, cols_int, rows_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+
+  // batch with cols
+  if (viennacl::detail::fft::is_radix2(rows_num))
+  {
+    //std::cout<<"output"<<output<<std::endl;
+
+    viennacl::linalg::radix2(output, rows_num, cols_int, cols_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+  }
+  else
+  {
+    viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> tmp(output.size1(),
+                                                                    output.size2());
+    tmp = output;
+    //std::cout<<"tmp"<<tmp<<std::endl;
+    viennacl::linalg::direct(tmp, output, rows_num, cols_int, cols_num, sign,
+                             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+  }
+}
+
+/**
+ * @brief Generic inplace version of inverse 1-D Fourier transformation.
+ *
+ * Shorthand for inplace_fft() with sign = 1.0, followed by normalization.
+ *
+ * @param input      Input vector.
+ * @param batch_num  Number of items in batch.
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_ifft(viennacl::vector<NumericT, AlignmentV>& input, vcl_size_t batch_num = 1)
+{
+  viennacl::inplace_fft(input, batch_num, NumericT(1.0));
+  viennacl::linalg::normalize(input);
+}
+
+/**
+ * @brief Generic version of inverse 1-D Fourier transformation.
+ *
+ * Shorthand for fft() with sign = 1.0, followed by normalization.
+ *
+ * @param input      Input vector.
+ * @param output     Output vector.
+ * @param batch_num  Number of items in batch.
+ */
+template<class NumericT, unsigned int AlignmentV>
+void ifft(viennacl::vector<NumericT, AlignmentV>& input,
+          viennacl::vector<NumericT, AlignmentV>& output, vcl_size_t batch_num = 1)
+{
+  viennacl::fft(input, output, batch_num, NumericT(1.0));
+  viennacl::linalg::normalize(output);
+}
+
+namespace linalg
+{
+  /**
+   * @brief 1-D convolution of two vectors.
+   *
+   * This function does not modify the input vectors.
+   *
+   * @param input1     Input vector #1.
+   * @param input2     Input vector #2.
+   * @param output     Output vector.
+   */
+  template<class NumericT, unsigned int AlignmentV>
+  void convolve(viennacl::vector<NumericT, AlignmentV>& input1,
+                viennacl::vector<NumericT, AlignmentV>& input2,
+                viennacl::vector<NumericT, AlignmentV>& output)
+  {
+    assert(input1.size() == input2.size());
+    assert(input1.size() == output.size());
+    // temporary arrays
+    viennacl::vector<NumericT, AlignmentV> tmp1(input1.size());
+    viennacl::vector<NumericT, AlignmentV> tmp2(input2.size());
+    viennacl::vector<NumericT, AlignmentV> tmp3(output.size());
+
+    // align input arrays to equal size
+    // FFT of input data
+    viennacl::fft(input1, tmp1);
+    viennacl::fft(input2, tmp2);
+
+    // multiplication of input data
+    viennacl::linalg::multiply_complex(tmp1, tmp2, tmp3);
+    // inverse FFT of input data
+    viennacl::ifft(tmp3, output);
+  }
+
+  /**
+   * @brief 1-D convolution of two vectors.
+   *
+   * This function may modify the input vectors in order to avoid additional memory allocations.
+   *
+   * @param input1     Input vector #1.
+   * @param input2     Input vector #2.
+   * @param output     Output vector.
+   */
+  template<class NumericT, unsigned int AlignmentV>
+  void convolve_i(viennacl::vector<NumericT, AlignmentV>& input1,
+                  viennacl::vector<NumericT, AlignmentV>& input2,
+                  viennacl::vector<NumericT, AlignmentV>& output)
+  {
+    assert(input1.size() == input2.size());
+    assert(input1.size() == output.size());
+
+    viennacl::inplace_fft(input1);
+    viennacl::inplace_fft(input2);
+
+    viennacl::linalg::multiply_complex(input1, input2, output);
+
+    viennacl::inplace_ifft(output);
+  }
+}      //namespace linalg
+}      //namespace viennacl
+
+/// @endcond
+#endif


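A short usage sketch of the routines above: a forward/inverse 1-D round trip on the interleaved-complex layout the header expects (a vector of length 2*N holding N re/im pairs). This is illustrative only; it assumes a working ViennaCL backend and that the input vector is filled beforehand.

    #include <cstddef>

    #include "viennacl/vector.hpp"
    #include "viennacl/fft.hpp"

    int main()
    {
      const std::size_t N = 8;                 // 8 complex samples (radix-2 size)
      viennacl::vector<float> signal(2 * N);   // interleaved re/im pairs
      viennacl::vector<float> spectrum(2 * N);

      // ... fill 'signal' via viennacl::copy() from a host container ...

      viennacl::fft(signal, spectrum);         // forward transform (sign = -1)
      viennacl::ifft(spectrum, signal);        // inverse transform + normalization

      return 0;
    }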
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
new file mode 100644
index 0000000..2f67a5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
@@ -0,0 +1,405 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
+ *  @brief OpenCL kernel file for coordinate_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_coordinate_matrix_vec_mul(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void vec_mul( \n");
+  source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const uint  * group_boundaries, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+  source.append("  __local unsigned int * shared_rows, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+  source.append("{ \n");
+  source.append("  uint2 tmp; \n");
+  source.append("  "); source.append(numeric_string); source.append(" val; \n");
+  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+  source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+  source.append("  uint local_index = 0; \n");
+
+  source.append("  for (uint k = 0; k < k_end; ++k) { \n");
+  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+  source.append("    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");
+
+  //check for carry from previous loop run:
+  source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
+  source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+  source.append("        val += inter_results[get_local_size(0)-1]; \n");
+  source.append("      else if (beta != 0) \n");
+  source.append("        result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] += alpha * inter_results[get_local_size(0)-1]; \n");
+  source.append("      else \n");
+  source.append("        result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x]  = alpha * inter_results[get_local_size(0)-1]; \n");
+  source.append("    } \n");
+
+  //segmented parallel reduction begin
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+  source.append("    inter_results[get_local_id(0)] = val; \n");
+  source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+  source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      inter_results[get_local_id(0)] += left; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+  //segmented parallel reduction end
+
+  source.append("    if (local_index < group_end - 1 && get_local_id(0) < get_local_size(0) - 1 && \n");
+  source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+  source.append("      if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
+  source.append("      else           result[tmp.x * layout_result.y + layout_result.x]  = alpha * inter_results[get_local_id(0)]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  }  \n"); //for k
+
+  source.append("  if (local_index + 1 == group_end) {\n");  //write results of last active entry (this may not necessarily be the case already)
+  source.append("    if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
+  source.append("    else           result[tmp.x * layout_result.y + layout_result.x]  = alpha * inter_results[get_local_id(0)]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+namespace detail
+{
+  /** @brief Generate kernel for C = A * B with A being a coordinate_matrix, B and C dense */
+  template<typename StringT>
+  void generate_coordinate_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+                                                   bool B_transposed, bool B_row_major, bool C_row_major)
+  {
+    source.append("__kernel void ");
+    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+    source.append("( \n");
+    source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
+    source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+    source.append("  __global const uint  * group_boundaries, \n");
+    source.append("  __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+    source.append("  unsigned int d_mat_row_start, \n");
+    source.append("  unsigned int d_mat_col_start, \n");
+    source.append("  unsigned int d_mat_row_inc, \n");
+    source.append("  unsigned int d_mat_col_inc, \n");
+    source.append("  unsigned int d_mat_row_size, \n");
+    source.append("  unsigned int d_mat_col_size, \n");
+    source.append("  unsigned int d_mat_internal_rows, \n");
+    source.append("  unsigned int d_mat_internal_cols, \n");
+    source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+    source.append("  unsigned int result_row_start, \n");
+    source.append("  unsigned int result_col_start, \n");
+    source.append("  unsigned int result_row_inc, \n");
+    source.append("  unsigned int result_col_inc, \n");
+    source.append("  unsigned int result_row_size, \n");
+    source.append("  unsigned int result_col_size, \n");
+    source.append("  unsigned int result_internal_rows, \n");
+    source.append("  unsigned int result_internal_cols, \n");
+    source.append("  __local unsigned int * shared_rows, \n");
+    source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+    source.append("{ \n");
+    source.append("  uint2 tmp; \n");
+    source.append("  "); source.append(numeric_string); source.append(" val; \n");
+    source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+    source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+    source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+    source.append("  uint local_index = 0; \n");
+
+    source.append("  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+    source.append("   for (uint k = 0; k < k_end; ++k) { \n");
+    source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+    source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+    if (B_transposed && B_row_major)
+      source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +      tmp.y * d_mat_col_inc ] : 0; \n");
+    else if (B_transposed && !B_row_major)
+      source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +      tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+    else if (!B_transposed && B_row_major)
+      source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start +      tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
+    else
+      source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start +      tmp.y * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+
+    //check for carry from previous loop run:
+    source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
+    source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+    source.append("        val += inter_results[get_local_size(0)-1]; \n");
+    source.append("      else \n");
+    if (C_row_major)
+      source.append("        result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
+    else
+      source.append("        result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
+    source.append("    } \n");
+
+    //segmented parallel reduction begin
+    source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+    source.append("    inter_results[get_local_id(0)] = val; \n");
+    source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+    source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+    source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+    source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("      inter_results[get_local_id(0)] += left; \n");
+    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("    } \n");
+    //segmented parallel reduction end
+
+    source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+    source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+    if (C_row_major)
+      source.append("      result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+    else
+      source.append("      result[(tmp.x * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+    source.append("    } \n");
+
+    source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("   }  \n"); //for k
+
+    source.append("   if (local_index + 1 == group_end) \n");  //write results of last active entry (this may not necessarily be the case already)
+    if (C_row_major)
+      source.append("    result[(tmp.x  * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+    else
+      source.append("    result[(tmp.x  * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+    source.append("  } \n"); //for result_col
+    source.append("} \n");
+
+  }
+}
+
+template<typename StringT>
+void generate_coordinate_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+}
+
+template<typename StringT>
+void generate_coordinate_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void row_info_extractor( \n");
+  source.append("          __global const uint2 * coords,  \n");//(row_index, column_index)
+  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("          __global const uint  * group_boundaries, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("          unsigned int option, \n");
+  source.append("          __local unsigned int * shared_rows, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+  source.append("{ \n");
+  source.append("  uint2 tmp; \n");
+  source.append("  "); source.append(numeric_string); source.append(" val; \n");
+  source.append("  uint last_index  = get_local_size(0) - 1; \n");
+  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+  source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : ("); source.append(numeric_string); source.append(")0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+  source.append("  uint local_index = 0; \n");
+
+  source.append("  for (uint k = 0; k < k_end; ++k) \n");
+  source.append("  { \n");
+  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+  source.append("    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");
+
+      //check for carry from previous loop run:
+  source.append("    if (get_local_id(0) == 0 && k > 0) \n");
+  source.append("    { \n");
+  source.append("      if (tmp.x == shared_rows[last_index]) \n");
+  source.append("      { \n");
+  source.append("        switch (option) \n");
+  source.append("        { \n");
+  source.append("          case 0: \n"); //inf-norm
+  source.append("          case 3: \n"); //diagonal entry
+  source.append("            val = max(val, fabs(inter_results[last_index])); \n");
+  source.append("            break; \n");
+
+  source.append("          case 1: \n"); //1-norm
+  source.append("            val = fabs(val) + inter_results[last_index]; \n");
+  source.append("            break; \n");
+
+  source.append("          case 2: \n"); //2-norm
+  source.append("            val = sqrt(val * val + inter_results[last_index]); \n");
+  source.append("            break; \n");
+
+  source.append("          default: \n");
+  source.append("            break; \n");
+  source.append("        } \n");
+  source.append("      } \n");
+  source.append("      else \n");
+  source.append("      { \n");
+  source.append("        switch (option) \n");
+  source.append("        { \n");
+  source.append("          case 0: \n"); //inf-norm
+  source.append("          case 1: \n"); //1-norm
+  source.append("          case 3: \n"); //diagonal entry
+  source.append("            result[shared_rows[last_index]] = inter_results[last_index]; \n");
+  source.append("            break; \n");
+
+  source.append("          case 2: \n"); //2-norm
+  source.append("            result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
+  source.append("          default: \n");
+  source.append("            break; \n");
+  source.append("        } \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+      //segmented parallel reduction begin
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+  source.append("    switch (option) \n");
+  source.append("    { \n");
+  source.append("      case 0: \n");
+  source.append("      case 3: \n");
+  source.append("        inter_results[get_local_id(0)] = val; \n");
+  source.append("        break; \n");
+  source.append("      case 1: \n");
+  source.append("        inter_results[get_local_id(0)] = fabs(val); \n");
+  source.append("        break; \n");
+  source.append("      case 2: \n");
+  source.append("        inter_results[get_local_id(0)] = val * val; \n");
+  source.append("      default: \n");
+  source.append("        break; \n");
+  source.append("    } \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
+  source.append("    { \n");
+  source.append("      "); source.append(numeric_string); source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      switch (option) \n");
+  source.append("      { \n");
+  source.append("        case 0: \n"); //inf-norm
+  source.append("        case 3: \n"); //diagonal entry
+  source.append("          inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
+  source.append("          break; \n");
+
+  source.append("        case 1: \n"); //1-norm
+  source.append("          inter_results[get_local_id(0)] += left; \n");
+  source.append("          break; \n");
+
+  source.append("        case 2: \n"); //2-norm
+  source.append("          inter_results[get_local_id(0)] += left; \n");
+  source.append("          break; \n");
+
+  source.append("        default: \n");
+  source.append("          break; \n");
+  source.append("      } \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+      //segmented parallel reduction end
+
+  source.append("    if (get_local_id(0) != last_index && \n");
+  source.append("        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
+  source.append("        inter_results[get_local_id(0)] != 0) \n");
+  source.append("    { \n");
+  source.append("      result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  } \n"); //for k
+
+  source.append("  if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
+  source.append("    result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for coordinate_matrix. */
+template<typename NumericT>
+struct coordinate_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_coordinate_matrix_vec_mul(source, numeric_string);
+      generate_coordinate_matrix_dense_matrix_multiplication(source, numeric_string);
+      generate_coordinate_matrix_row_info_extractor(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+

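For context, a sketch of how such kernel classes are typically consumed: the program is registered once per OpenCL context, then individual kernels are looked up by name. The call sites are not part of this diff, so treat the function below (prepare_coordinate_matrix_kernels) as a hypothetical illustration; it relies on viennacl::ocl::current_context() and context::get_kernel() from the ViennaCL OpenCL backend.

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"

    void prepare_coordinate_matrix_kernels()
    {
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();

      // Compiles the program holding vec_mul, the eight sparse*dense variants and
      // row_info_extractor for float; subsequent calls are no-ops.
      viennacl::linalg::opencl::kernels::coordinate_matrix<float>::init(ctx);

      viennacl::ocl::kernel & k =
          ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<float>::program_name(),
                         "vec_mul");
      (void)k;   // a real caller would set arguments and enqueue the kernel
    }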
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
new file mode 100644
index 0000000..23c6af9
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
@@ -0,0 +1,221 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ell_matrix.hpp
+ *  @brief OpenCL kernel file for ell_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_ell_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul( \n");
+  source.append("  __global const unsigned int * coords, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" beta, \n"); }
+  source.append("  unsigned int row_num, \n");
+  source.append("  unsigned int col_num, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row) \n");
+  source.append("{ \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row_id; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+
+  source.append("       if (val != 0.0f) { \n");
+  source.append("          int col = coords[offset]; \n");
+  source.append("          sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+  source.append("       } \n");
+
+  source.append("    } \n");
+
+  if (with_alpha_beta)
+    source.append("    result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row_id * layout_result.y + layout_result.x] = sum; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+namespace detail
+{
+  template<typename StringT>
+  void generate_ell_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+                                            bool B_transposed, bool B_row_major, bool C_row_major)
+  {
+    source.append("__kernel void ");
+    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+    source.append("( \n");
+    source.append("    __global const unsigned int * sp_mat_coords, \n");
+    source.append("    __global const "); source.append(numeric_string); source.append(" * sp_mat_elems, \n");
+    source.append("    unsigned int sp_mat_row_num, \n");
+    source.append("    unsigned int sp_mat_col_num, \n");
+    source.append("    unsigned int sp_mat_internal_row_num, \n");
+    source.append("    unsigned int sp_mat_items_per_row, \n");
+    source.append("    unsigned int sp_mat_aligned_items_per_row, \n");
+    source.append("    __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+    source.append("    unsigned int d_mat_row_start, \n");
+    source.append("    unsigned int d_mat_col_start, \n");
+    source.append("    unsigned int d_mat_row_inc, \n");
+    source.append("    unsigned int d_mat_col_inc, \n");
+    source.append("    unsigned int d_mat_row_size, \n");
+    source.append("    unsigned int d_mat_col_size, \n");
+    source.append("    unsigned int d_mat_internal_rows, \n");
+    source.append("    unsigned int d_mat_internal_cols, \n");
+    source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+    source.append("    unsigned int result_row_start, \n");
+    source.append("    unsigned int result_col_start, \n");
+    source.append("    unsigned int result_row_inc, \n");
+    source.append("    unsigned int result_col_inc, \n");
+    source.append("    unsigned int result_row_size, \n");
+    source.append("    unsigned int result_col_size, \n");
+    source.append("    unsigned int result_internal_rows, \n");
+    source.append("    unsigned int result_internal_cols) { \n");
+
+    source.append("    uint glb_id = get_global_id(0); \n");
+    source.append("    uint glb_sz = get_global_size(0); \n");
+
+    source.append("    for ( uint rc = glb_id; rc < (sp_mat_row_num * result_col_size); rc += glb_sz) { \n");
+    source.append("      uint row = rc % sp_mat_row_num; \n");
+    source.append("      uint col = rc / sp_mat_row_num; \n");
+
+    source.append("      uint offset = row; \n");
+    source.append("      "); source.append(numeric_string); source.append(" r = ("); source.append(numeric_string); source.append(")0; \n");
+
+    source.append("      for ( uint k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) { \n");
+
+    source.append("        uint j = sp_mat_coords[offset]; \n");
+    source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elems[offset]; \n");
+
+    source.append("        if (x != ("); source.append(numeric_string); source.append(")0) { \n");
+    source.append("          "); source.append(numeric_string);
+    if (B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
+    else if (B_transposed && !B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+    else if (!B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+    else
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+
+    source.append("          r += x*y; \n");
+    source.append("        } \n");
+    source.append("      } \n");
+
+    if (C_row_major)
+      source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+    else
+      source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+    source.append("    } \n");
+    source.append("} \n");
+
+  }
+}
+
+template<typename StringT>
+void generate_ell_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+  detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for ell_matrix. */
+template<typename NumericT>
+struct ell_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_ell_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parameterized kernels:
+      generate_ell_vec_mul(source, numeric_string, true);
+      generate_ell_vec_mul(source, numeric_string, false);
+      generate_ell_matrix_dense_matrix_multiplication(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
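
Editor's note: the kernel classes in the file above follow a lazy-compile pattern: init(ctx) builds the
OpenCL program once per context, after which individual kernels are fetched by name. A minimal usage
sketch (not part of the commit) is given below; the kernel name "vec_mul" and the helper function are
assumptions based on the generator functions in this file, not code from the repository.

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/ell_matrix.hpp"

    template<typename NumericT>
    viennacl::ocl::kernel & get_ell_vec_mul_kernel(viennacl::ocl::context & ctx)
    {
      // Compiles the "<type>_ell_matrix" program on first use for this context,
      // then returns the cached kernel object for subsequent enqueues.
      viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
      return ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(),
                            "vec_mul");
    }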

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
new file mode 100644
index 0000000..1447bd1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
@@ -0,0 +1,311 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/fft.hpp
+ *  @brief OpenCL kernel file for FFT operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// Postprocessing phase of Bluestein algorithm
+template<typename StringT>
+void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
+  source.append("                             __global "); source.append(numeric_string); source.append("2 *out, \n");
+  source.append("                             unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int glb_id = get_global_id(0); \n");
+  source.append("  unsigned int glb_sz = get_global_size(0); \n");
+
+  source.append("  unsigned int double_size = size << 1; \n");
+  source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+  source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+  source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+  source.append("    unsigned int rm = i * i % (double_size); \n");
+  source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");
+
+  source.append("    sn_a = sincos(angle, &cs_a); \n");
+
+  source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+  source.append("    out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+// Preprocessing phase of Bluestein algorithm
+template<typename StringT>
+void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *A, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *B, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  unsigned int ext_size \n");
+  source.append("  ) { \n");
+  source.append("  unsigned int glb_id = get_global_id(0); \n");
+  source.append("  unsigned int glb_sz = get_global_size(0); \n");
+
+  source.append("  unsigned int double_size = size << 1; \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+  source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+  source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+  source.append("    unsigned int rm = i * i % (double_size); \n");
+  source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");
+
+  source.append("    sn_a = sincos(-angle, &cs_a); \n");
+
+  source.append("    "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+  source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");
+
+  source.append("    A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
+  source.append("    B[i] = b_i; \n");
+
+          // TODO: the data-dependent conditional write below is inefficient and should be fixed
+  source.append("    if (i) \n");
+  source.append("      B[ext_size - i] = b_i; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Extract real part of a complex number array */
+template<typename StringT>
+void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("  *out, \n");
+  source.append("  unsigned int size) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
+  source.append("    out[i] = in[i].x; \n");
+  source.append("} \n");
+}
+
+/** @brief OpenCL kernel generation code for dividing a complex number by a real number */
+template<typename StringT>
+void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  "); source.append(numeric_string); source.append(" factor) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
+  source.append("    input1[i] /= factor; \n");
+  source.append("} \n");
+}
+
+/** @brief Elementwise product of two complex vectors */
+template<typename StringT>
+void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append("2 *input2, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
+  source.append("  unsigned int size) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");
+
+  source.append("    output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Embeds a real-valued vector into a complex one */
+template<typename StringT>
+void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *out, \n");
+  source.append("  unsigned int size) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append("2 val = 0; \n");
+  source.append("    val.x = in[i]; \n");
+  source.append("    out[i] = val; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Reverses the entries in a vector */
+template<typename StringT>
+void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
+  source.append("  for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");
+
+  source.append("    vec[i] = val2; \n");
+  source.append("    vec[size - i - 1] = val1; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Simplistic matrix transpose function */
+template<typename StringT>
+void generate_fft_transpose(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
+  source.append("  unsigned int row_num, \n");
+  source.append("  unsigned int col_num) { \n");
+  source.append("  unsigned int size = row_num * col_num; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+  source.append("    unsigned int row = i / col_num; \n");
+  source.append("    unsigned int col = i - row*col_num; \n");
+
+  source.append("    unsigned int new_pos = col * row_num + row; \n");
+
+  source.append("    output[new_pos] = input[i]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Simplistic inplace matrix transpose function */
+template<typename StringT>
+void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
+  source.append("  unsigned int row_num, \n");
+  source.append("  unsigned int col_num) { \n");
+  source.append("  unsigned int size = row_num * col_num; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+  source.append("    unsigned int row = i / col_num; \n");
+  source.append("    unsigned int col = i - row*col_num; \n");
+
+  source.append("    unsigned int new_pos = col * row_num + row; \n");
+
+  source.append("    if (i < new_pos) { \n");
+  source.append("      "); source.append(numeric_string); source.append("2 val = input[i]; \n");
+  source.append("      input[i] = input[new_pos]; \n");
+  source.append("      input[new_pos] = val; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Computes the matrix vector product with a Vandermonde matrix */
+template<typename StringT>
+void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *vector, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *result, \n");
+  source.append("  uint size) { \n");
+  source.append("  for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" pwr = 1; \n");
+  source.append("    "); source.append(numeric_string); source.append(" val = 0; \n");
+
+  source.append("    for (uint j = 0; j < size; j++) { \n");
+  source.append("      val = val + pwr * vector[j]; \n");
+  source.append("      pwr *= mul; \n");
+  source.append("    } \n");
+
+  source.append("    result[i] = val; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+/** @brief Zero two complex vectors (to avoid kernel launch overhead) */
+template<typename StringT>
+void generate_fft_zero2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("2 *input2, \n");
+  source.append("  unsigned int size) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    input1[i] = 0; \n");
+  source.append("    input2[i] = 0; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for the fast Fourier transform. */
+template<typename NumericT>
+struct fft
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_fft";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // unary operations
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_fft_bluestein_post(source, numeric_string);
+        generate_fft_bluestein_pre(source, numeric_string);
+        generate_fft_complex_to_real(source, numeric_string);
+        generate_fft_div_vec_scalar(source, numeric_string);
+        generate_fft_mult_vec(source, numeric_string);
+        generate_fft_real_to_complex(source, numeric_string);
+        generate_fft_reverse_inplace(source, numeric_string);
+        generate_fft_transpose(source, numeric_string);
+        generate_fft_transpose_inplace(source, numeric_string);
+        generate_fft_vandermonde_prod(source, numeric_string);
+        generate_fft_zero2(source, numeric_string);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
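
Editor's note: for validating the element-wise complex product performed by the fft_mult_vec kernel
above, a plain host-side reference (not part of the commit) can be written with std::complex; the pair
(x1*x2 - y1*y2, x1*y2 + y1*x2) built in the kernel is exactly the complex multiplication below.

    #include <complex>
    #include <cstddef>
    #include <vector>

    // Reference for fft_mult_vec: out[i] = in1[i] * in2[i] on interleaved complex data.
    inline void fft_mult_vec_reference(std::vector<std::complex<float> > const & in1,
                                       std::vector<std::complex<float> > const & in2,
                                       std::vector<std::complex<float> >       & out)
    {
      out.resize(in1.size());
      for (std::size_t i = 0; i < in1.size(); ++i)
        out[i] = in1[i] * in2[i];   // same as (x1*x2 - y1*y2, x1*y2 + y1*x2) in the kernel
    }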

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
new file mode 100644
index 0000000..83d1411
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
@@ -0,0 +1,240 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/hyb_matrix.hpp
+ *  @brief OpenCL kernel file for hyb_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_hyb_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul( \n");
+  source.append("  const __global int* ell_coords, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+  source.append("  const __global uint* csr_rows, \n");
+  source.append("  const __global uint* csr_cols, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" beta, \n"); }
+  source.append("  unsigned int row_num, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row) \n");
+  source.append("{ \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row_id; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+  source.append("      if (val != ("); source.append(numeric_string); source.append(")0) { \n");
+  source.append("        int col = ell_coords[offset]; \n");
+  source.append("        sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+  source.append("      } \n");
+
+  source.append("    } \n");
+
+  source.append("    uint col_begin = csr_rows[row_id]; \n");
+  source.append("    uint col_end   = csr_rows[row_id + 1]; \n");
+
+  source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+  source.append("      sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
+  source.append("    } \n");
+
+  if (with_alpha_beta)
+    source.append("    result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row_id * layout_result.y + layout_result.x] = sum; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+namespace detail
+{
+  template<typename StringT>
+  void generate_hyb_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+                                            bool B_transposed, bool B_row_major, bool C_row_major)
+  {
+    source.append("__kernel void ");
+    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+    source.append("( \n");
+    source.append("  const __global int* ell_coords, \n");
+    source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+    source.append("  const __global uint* csr_rows, \n");
+    source.append("  const __global uint* csr_cols, \n");
+    source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+    source.append("  unsigned int row_num, \n");
+    source.append("  unsigned int internal_row_num, \n");
+    source.append("  unsigned int items_per_row, \n");
+    source.append("  unsigned int aligned_items_per_row, \n");
+    source.append("    __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+    source.append("    unsigned int d_mat_row_start, \n");
+    source.append("    unsigned int d_mat_col_start, \n");
+    source.append("    unsigned int d_mat_row_inc, \n");
+    source.append("    unsigned int d_mat_col_inc, \n");
+    source.append("    unsigned int d_mat_row_size, \n");
+    source.append("    unsigned int d_mat_col_size, \n");
+    source.append("    unsigned int d_mat_internal_rows, \n");
+    source.append("    unsigned int d_mat_internal_cols, \n");
+    source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+    source.append("    unsigned int result_row_start, \n");
+    source.append("    unsigned int result_col_start, \n");
+    source.append("    unsigned int result_row_inc, \n");
+    source.append("    unsigned int result_col_inc, \n");
+    source.append("    unsigned int result_row_size, \n");
+    source.append("    unsigned int result_col_size, \n");
+    source.append("    unsigned int result_internal_rows, \n");
+    source.append("    unsigned int result_internal_cols) { \n");
+
+    source.append("  uint glb_id = get_global_id(0); \n");
+    source.append("  uint glb_sz = get_global_size(0); \n");
+
+    source.append("  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+    source.append("   for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+    source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+    source.append("    uint offset = row_id; \n");
+    source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+    source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+    source.append("      if (val != ("); source.append(numeric_string); source.append(")0) { \n");
+    source.append("        int col = ell_coords[offset]; \n");
+    if (B_transposed && B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        col * d_mat_col_inc                        ] * val; \n");
+    else if (B_transposed && !B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +        col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+    else if (!B_transposed && B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + result_col * d_mat_col_inc                        ] * val; \n");
+    else
+      source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+    source.append("      } \n");
+
+    source.append("    } \n");
+
+    source.append("    uint col_begin = csr_rows[row_id]; \n");
+    source.append("    uint col_end   = csr_rows[row_id + 1]; \n");
+
+    source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+    if (B_transposed && B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + csr_cols[item_id] * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
+    else if (B_transposed && !B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc)                       + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+    else if (!B_transposed && B_row_major)
+      source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        result_col * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
+    else
+      source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc)                       + (d_mat_col_start +        result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+    source.append("    } \n");
+
+    if (C_row_major)
+      source.append("      result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
+    else
+      source.append("      result[ (result_row_start + row_id * result_row_inc)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
+    source.append("   } \n");
+    source.append("  } \n");
+    source.append("} \n");
+  }
+}
+
+template<typename StringT>
+void generate_hyb_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+  detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for hyb_matrix. */
+template<typename NumericT>
+struct hyb_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_hyb_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_hyb_vec_mul(source, numeric_string, true);
+      generate_hyb_vec_mul(source, numeric_string, false);
+      generate_hyb_matrix_dense_matrix_multiplication(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
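
Editor's note: the vec_mul kernel above walks a HYB matrix in two parts: a column-major ELL block of
fixed width (items_per_row entries per row, padded with zeros) plus a CSR "overflow" part for rows that
exceed that width. The hypothetical host-side sketch below (not part of the commit; all names are
illustrative) mirrors the kernel's two inner loops for a single row.

    #include <vector>

    struct hyb_host_view
    {
      std::vector<int>          ell_coords;    // internal_row_num * items_per_row column indices
      std::vector<double>       ell_elements;  // same layout; a stored 0 marks padding
      std::vector<unsigned int> csr_rows;      // row_num + 1 offsets into csr_cols/csr_elements
      std::vector<unsigned int> csr_cols;
      std::vector<double>       csr_elements;
      unsigned int              internal_row_num;
      unsigned int              items_per_row;
    };

    // y[row] = (ELL part) + (CSR part), matching the kernel's traversal order.
    inline double hyb_row_dot(hyb_host_view const & A, std::vector<double> const & x, unsigned int row)
    {
      double sum = 0;
      for (unsigned int k = 0, off = row; k < A.items_per_row; ++k, off += A.internal_row_num)
        if (A.ell_elements[off] != 0)
          sum += x[static_cast<unsigned int>(A.ell_coords[off])] * A.ell_elements[off];
      for (unsigned int j = A.csr_rows[row]; j < A.csr_rows[row + 1]; ++j)
        sum += x[A.csr_cols[j]] * A.csr_elements[j];
      return sum;
    }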

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
new file mode 100644
index 0000000..bef778c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
@@ -0,0 +1,505 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ilu.hpp
+ *  @brief OpenCL kernel file for incomplete LU and incomplete Cholesky factorization preconditioners */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+template<typename StringT>
+void generate_ilu_level_scheduling_substitute(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void level_scheduling_substitute( \n");
+  source.append("  __global const unsigned int * row_index_array, \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vec, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < size; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int eq_row = row_index_array[row]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" vec_entry = vec[eq_row]; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+
+  source.append("    for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
+  source.append("      vec_entry -= vec[column_indices[j]] * elements[j]; \n");
+
+  source.append("    vec[eq_row] = vec_entry; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+///////////// ICC ///////////////
+
+
+template<typename StringT>
+void generate_icc_extract_L_1(StringT & source)
+{
+  source.append("__kernel void extract_L_1( \n");
+  source.append("  __global unsigned int const *A_row_indices, \n");
+  source.append("  __global unsigned int const *A_col_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int *L_row_indices) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < A_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+  source.append("    unsigned int num_entries_L = 0; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = A_col_indices[j]; \n");
+  source.append("      if (col <= row) ++num_entries_L; \n");
+  source.append("    } \n");
+
+  source.append("    L_row_indices[row] = num_entries_L;   \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_icc_extract_L_2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void extract_L_2( \n");
+  source.append("  __global unsigned int const *A_row_indices, \n");
+  source.append("  __global unsigned int const *A_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int const *L_row_indices, \n");
+  source.append("  __global unsigned int       *L_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *L_elements) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < A_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+  source.append("    unsigned int index_L = L_row_indices[row]; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = A_col_indices[j]; \n");
+  source.append("      "); source.append(numeric_string); source.append(" value = A_elements[j]; \n");
+
+  source.append("      if (col <= row) { \n");
+  source.append("        L_col_indices[index_L] = col; \n");
+  source.append("        L_elements[index_L]    = value; \n");
+  source.append("        ++index_L; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_icc_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void icc_chow_patel_sweep_kernel( \n");
+  source.append("  __global unsigned int const *L_row_indices, \n");
+  source.append("  __global unsigned int const *L_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("       *L_elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
+  source.append("  unsigned int L_size1, \n");
+
+  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_L) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < L_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+
+  //
+  // Update L:
+  //
+  source.append("    unsigned int row_Li_start = L_row_indices[row]; \n");
+  source.append("    unsigned int row_Li_end   = L_row_indices[row + 1]; \n");
+
+  source.append("    for (unsigned int i = row_Li_start; i < row_Li_end; ++i) { \n");
+  source.append("      unsigned int col = L_col_indices[i]; \n");
+
+  source.append("      unsigned int row_Lj_start = L_row_indices[col]; \n");
+  source.append("      unsigned int row_Lj_end   = L_row_indices[col + 1]; \n");
+
+  source.append("      unsigned int index_Lj = row_Lj_start; \n");
+  source.append("      unsigned int col_Lj = L_col_indices[index_Lj]; \n");
+
+  source.append("      "); source.append(numeric_string); source.append(" s = aij_L[i]; \n");
+  source.append("      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) { \n");
+  source.append("        unsigned int col_Li = L_col_indices[index_Li]; \n");
+
+  source.append("        while (col_Lj < col_Li) { \n");
+  source.append("          ++index_Lj; \n");
+  source.append("          col_Lj = L_col_indices[index_Lj]; \n");
+  source.append("        } \n");
+
+  source.append("        if (col_Lj == col_Li) \n");
+  source.append("          s -= L_backup[index_Li] * L_backup[index_Lj]; \n");
+  source.append("      } \n");
+
+  // update l_ij:
+  source.append("      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+
+///////////// ILU ///////////////
+
+template<typename StringT>
+void generate_ilu_extract_LU_1(StringT & source)
+{
+  source.append("__kernel void extract_LU_1( \n");
+  source.append("  __global unsigned int const *A_row_indices, \n");
+  source.append("  __global unsigned int const *A_col_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int *L_row_indices, \n");
+  source.append("  __global unsigned int *U_row_indices) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < A_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+  source.append("    unsigned int num_entries_L = 0; \n");
+  source.append("    unsigned int num_entries_U = 0; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = A_col_indices[j]; \n");
+  source.append("      if (col <= row) ++num_entries_L; \n");
+  source.append("      if (col >= row) ++num_entries_U; \n");
+  source.append("    } \n");
+
+  source.append("    L_row_indices[row] = num_entries_L;   \n");
+  source.append("    U_row_indices[row] = num_entries_U;   \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_extract_LU_2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void extract_LU_2( \n");
+  source.append("  __global unsigned int const *A_row_indices, \n");
+  source.append("  __global unsigned int const *A_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int const *L_row_indices, \n");
+  source.append("  __global unsigned int       *L_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *L_elements, \n");
+  source.append("  __global unsigned int const *U_row_indices, \n");
+  source.append("  __global unsigned int       *U_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *U_elements) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < A_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+  source.append("    unsigned int index_L = L_row_indices[row]; \n");
+  source.append("    unsigned int index_U = U_row_indices[row]; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = A_col_indices[j]; \n");
+  source.append("      "); source.append(numeric_string); source.append(" value = A_elements[j]; \n");
+
+  source.append("      if (col <= row) { \n");
+  source.append("        L_col_indices[index_L] = col; \n");
+  source.append("        L_elements[index_L]    = value; \n");
+  source.append("        ++index_L; \n");
+  source.append("      } \n");
+  source.append("      if (col >= row) { \n");
+  source.append("        U_col_indices[index_U] = col; \n");
+  source.append("        U_elements[index_U]    = value; \n");
+  source.append("        ++index_U; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_scale_kernel_1(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void ilu_scale_kernel_1( \n");
+  source.append("  __global unsigned int const *A_row_indices, \n");
+  source.append("  __global unsigned int const *A_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("       *D_elements) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < A_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = A_col_indices[j]; \n");
+
+  source.append("      if (col == row) { \n");
+  source.append("        D_elements[row] = 1 / sqrt(fabs(A_elements[j])); \n");
+  source.append("        break; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_scale_kernel_2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void ilu_scale_kernel_2( \n");
+  source.append("  __global unsigned int const *R_row_indices, \n");
+  source.append("  __global unsigned int const *R_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("       *R_elements, \n");
+  source.append("  unsigned int R_size1, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *D_elements) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < R_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = R_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = R_row_indices[row+1]; \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" D_row = D_elements[row]; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) \n");
+  source.append("      R_elements[j] *= D_row * D_elements[R_col_indices[j]]; \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void ilu_chow_patel_sweep_kernel( \n");
+  source.append("  __global unsigned int const *L_row_indices, \n");
+  source.append("  __global unsigned int const *L_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("       *L_elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
+  source.append("  unsigned int L_size1, \n");
+
+  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_L, \n");
+
+  source.append("  __global unsigned int const *U_trans_row_indices, \n");
+  source.append("  __global unsigned int const *U_trans_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("       *U_trans_elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const *U_trans_backup, \n");
+
+  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_U_trans) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < L_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+
+  //
+  // Update L:
+  //
+  source.append("    unsigned int row_L_start = L_row_indices[row]; \n");
+  source.append("    unsigned int row_L_end   = L_row_indices[row + 1]; \n");
+
+  source.append("    for (unsigned int j = row_L_start; j < row_L_end; ++j) { \n");
+  source.append("      unsigned int col = L_col_indices[j]; \n");
+
+  source.append("      if (col == row) continue; \n");
+
+  source.append("      unsigned int row_U_start = U_trans_row_indices[col]; \n");
+  source.append("      unsigned int row_U_end   = U_trans_row_indices[col + 1]; \n");
+
+  source.append("      unsigned int index_U = row_U_start; \n");
+  source.append("      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; \n");
+
+  source.append("      "); source.append(numeric_string); source.append(" sum = 0; \n");
+  source.append("      for (unsigned int k = row_L_start; k < j; ++k) { \n");
+  source.append("        unsigned int col_L = L_col_indices[k]; \n");
+
+  source.append("        while (col_U < col_L) { \n");
+  source.append("          ++index_U; \n");
+  source.append("          col_U = U_trans_col_indices[index_U]; \n");
+  source.append("        } \n");
+
+  source.append("        if (col_U == col_L) \n");
+  source.append("          sum += L_backup[k] * U_trans_backup[index_U]; \n");
+  source.append("      } \n");
+
+  // update l_ij:
+  source.append("      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; \n");
+  source.append("    } \n");
+
+  //
+  // Update U:
+  //
+  source.append("    unsigned int row_U_start = U_trans_row_indices[row]; \n");
+  source.append("    unsigned int row_U_end   = U_trans_row_indices[row + 1]; \n");
+
+  source.append("    for (unsigned int j = row_U_start; j < row_U_end; ++j) { \n");
+  source.append("      unsigned int col = U_trans_col_indices[j]; \n");
+
+  source.append("      row_L_start = L_row_indices[col]; \n");
+  source.append("      row_L_end   = L_row_indices[col + 1]; \n");
+
+  // compute \sum_{k=1}^{j-1} l_ik u_kj
+  source.append("      unsigned int index_L = row_L_start; \n");
+  source.append("      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; \n");
+  source.append("      "); source.append(numeric_string); source.append(" sum = 0; \n");
+  source.append("      for (unsigned int k = row_U_start; k < j; ++k) { \n");
+  source.append("        unsigned int col_U = U_trans_col_indices[k]; \n");
+
+  // find element in L:
+  source.append("        while (col_L < col_U) { \n");
+  source.append("          ++index_L; \n");
+  source.append("          col_L = L_col_indices[index_L]; \n");
+  source.append("        } \n");
+
+  source.append("        if (col_U == col_L) \n");
+  source.append("          sum += L_backup[index_L] * U_trans_backup[k]; \n");
+  source.append("      } \n");
+
+  // update U_ij:
+  source.append("      U_trans_elements[j] = aij_U_trans[j] - sum; \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_ilu_form_neumann_matrix_kernel(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void ilu_form_neumann_matrix_kernel( \n");
+  source.append("  __global unsigned int const *R_row_indices, \n");
+  source.append("  __global unsigned int const *R_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *R_elements, \n");
+  source.append("  unsigned int R_size1, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *D_elements) { \n");
+
+  source.append("  for (unsigned int row  = get_global_id(0); \n");
+  source.append("                    row  < R_size1; \n");
+  source.append("                    row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_begin = R_row_indices[row]; \n");
+  source.append("    unsigned int row_end   = R_row_indices[row+1]; \n");
+
+  // Part 1: Extract and set diagonal entry
+  source.append("    "); source.append(numeric_string); source.append(" diag = D_elements[row]; \n");
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+  source.append("      unsigned int col = R_col_indices[j]; \n");
+  source.append("      if (col == row) { \n");
+  source.append("        diag = R_elements[j]; \n");
+  source.append("        R_elements[j] = 0; \n");
+  source.append("        break; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("    D_elements[row] = diag; \n");
+
+  // Part 2: Scale
+  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) \n");
+  source.append("      R_elements[j] /= -diag; \n");
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for incomplete LU factorization preconditioners. */
+template<class NumericT>
+struct ilu
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_ilu";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // only generate for floating points (forces error for integers)
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_ilu_level_scheduling_substitute(source, numeric_string);
+
+        generate_icc_extract_L_1(source);
+        generate_icc_extract_L_2(source, numeric_string);
+        generate_icc_chow_patel_sweep_kernel(source, numeric_string);
+
+        generate_ilu_extract_LU_1(source);
+        generate_ilu_extract_LU_2(source, numeric_string);
+        generate_ilu_scale_kernel_1(source, numeric_string);
+        generate_ilu_scale_kernel_2(source, numeric_string);
+        generate_ilu_chow_patel_sweep_kernel(source, numeric_string);
+        generate_ilu_form_neumann_matrix_kernel(source, numeric_string);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
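
Editor's note: ilu_scale_kernel_1 and ilu_scale_kernel_2 above implement a symmetric diagonal scaling of
a CSR matrix: D[i] = 1/sqrt(|a_ii|), followed by a_ij <- D[i] * a_ij * D[j]. A host-side reference (not
part of the commit; names are illustrative) is sketched below.

    #include <cmath>
    #include <vector>

    inline void ilu_scale_reference(std::vector<unsigned int> const & row_ptr,   // size n+1
                                    std::vector<unsigned int> const & cols,
                                    std::vector<double>             & vals,
                                    std::vector<double>             & D)
    {
      unsigned int n = static_cast<unsigned int>(row_ptr.size()) - 1;
      D.assign(n, 0.0);
      // kernel 1: extract the scaled diagonal
      for (unsigned int i = 0; i < n; ++i)
        for (unsigned int j = row_ptr[i]; j < row_ptr[i + 1]; ++j)
          if (cols[j] == i) { D[i] = 1.0 / std::sqrt(std::fabs(vals[j])); break; }
      // kernel 2: scale every stored entry from both sides
      for (unsigned int i = 0; i < n; ++i)
        for (unsigned int j = row_ptr[i]; j < row_ptr[i + 1]; ++j)
          vals[j] *= D[i] * D[cols[j]];
    }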


[35/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
new file mode 100644
index 0000000..912d24d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
@@ -0,0 +1,2725 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here!
+//
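+// Indexing convention used throughout this file (a summary derived from the kernels below, for
+// orientation only): for a row-major matrix, element (i, j) of a range/slice is stored at
+//     ptr[(start1 + i * stride1) * internal_size2 + (start2 + j * stride2)]
+// and for a column-major matrix at
+//     ptr[(start1 + i * stride1) + (start2 + j * stride2) * internal_size1],
+// which is why each kernel launch receives start, stride, size and internal_size for both dimensions.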
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  if (mat1.row_major())
+  {
+    convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
+  }
+  else
+  {
+    convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
+  }
+}
+
+template<typename NumericT, typename SizeT, typename DistanceT>
+void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
+           matrix_base<NumericT> & temp_trans)
+{
+  trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
+                            static_cast<unsigned int>(proxy.lhs().start1()),          static_cast<unsigned int>(proxy.lhs().start2()),
+                            static_cast<unsigned int>(proxy.lhs().internal_size1()),  static_cast<unsigned int>(proxy.lhs().internal_size2()),
+                            static_cast<unsigned int>(proxy.lhs().size1()),           static_cast<unsigned int>(proxy.lhs().size2()),
+                            static_cast<unsigned int>(proxy.lhs().stride1()),         static_cast<unsigned int>(proxy.lhs().stride2()),
+
+                            viennacl::cuda_arg(temp_trans),
+                            static_cast<unsigned int>(temp_trans.start1()),            static_cast<unsigned int>(temp_trans.start2()),
+                            static_cast<unsigned int>(temp_trans.internal_size1()),    static_cast<unsigned int>(temp_trans.internal_size2()),
+                            static_cast<unsigned int>(temp_trans.stride1()),           static_cast<unsigned int>(temp_trans.stride2()),
+                            static_cast<bool>(proxy.lhs().row_major()));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
+}
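+// Usage sketch (illustrative only, assuming a CUDA-enabled context and suitably sized matrices):
+//
+//   viennacl::matrix<float> A(m, n);
+//   viennacl::matrix<float> B(n, m);
+//   B = viennacl::trans(A);   // the frontend materializes the transpose through trans() above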
+
+
+template<typename NumericT, typename ScalarT>
+void am(matrix_base<NumericT> & mat1,
+        matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT>::value)
+    temporary_alpha = alpha;
+
+  if (mat1.row_major())
+  {
+    am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                viennacl::cuda_arg(mat2),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                              );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
+  }
+  else
+  {
+    am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                viennacl::cuda_arg(mat2),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                              );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  if (mat1.row_major())
+  {
+    ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                  options_alpha,
+                                  viennacl::cuda_arg(mat2),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                  options_beta,
+                                  viennacl::cuda_arg(mat3),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
+  }
+  else
+  {
+    ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                  options_alpha,
+                                  viennacl::cuda_arg(mat2),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                  options_beta,
+                                  viennacl::cuda_arg(mat3),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
+  }
+
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  if (mat1.row_major())
+  {
+    ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                    options_alpha,
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                    options_beta,
+                                    viennacl::cuda_arg(mat3),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
+  }
+  else
+  {
+    ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                    options_alpha,
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                    options_beta,
+                                    viennacl::cuda_arg(mat3),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
+  }
+
+}
+
+
+
+
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  typedef NumericT        value_type;
+  value_type alpha = s;
+
+  unsigned int s1  = static_cast<unsigned int>(clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat));
+  unsigned int s2  = static_cast<unsigned int>(clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat));
+
+  if (mat.row_major())
+  {
+
+    matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                           s1,                                                                 s2,
+                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                           alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
+  }
+  else
+  {
+    matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                            static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                            static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                            s1,                                                                 s2,
+                                            static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                            alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
+  }
+}
+
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+  typedef NumericT        value_type;
+  value_type alpha = s;
+
+  if (mat.row_major())
+  {
+    matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                    alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
+  }
+  else
+  {
+    matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                    alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
+  }
+}
+
+
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+  typedef NumericT        value_type;
+
+  // Step 1: assign zero matrix:
+  matrix_assign(mat, NumericT(0));
+
+  // Step 2: Assign diagonal:
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  vcl_size_t mat_size = viennacl::traits::size(vec);
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride),
+                          static_cast<unsigned int>(mat_size),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
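+// Worked example (illustrative only): for a row-major 4x4 matrix with internal_size2 == 4,
+// start1 == start2 == 0 and stride1 == stride2 == 1, the sub-diagonal k == -1 gives
+//   first_row_index = 1,  mat_start = 1 * 4 = 4,  mat_stride = 4 + 1 = 5,
+// so av_kernel writes the three vector entries to linear offsets 4, 9 and 14,
+// i.e. to the matrix elements (1,0), (2,1) and (3,2).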
+
+template<typename NumericT>
+void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    mat_start  = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
+    mat_stride = viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    mat_start  = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    mat_start  = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat); // stepping along column j advances the row index i
+  }
+  else
+  {
+    mat_start  = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride1(mat); // stepping along column j advances the row index i
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+
+//
+/////////////////////////   binary element-wise operations    /////////////////////////////////
+//
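+// These overloads are reached through the expression frontend; a minimal usage sketch
+// (assuming matrices A, B, C of identical size and layout on a CUDA context):
+//
+//   A = viennacl::linalg::element_prod(B, C);   // op_type 0
+//   A = viennacl::linalg::element_div(B, C);    // op_type 1
+//   A = viennacl::linalg::element_pow(B, C);    // op_type 2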
+
+
+template<typename NumericT, typename SizeT, typename OpT>
+void element_op(matrix_base<NumericT, SizeT> & A,
+                matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT      value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<float, SizeT> & A,
+                matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef float        value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<double, SizeT> & A,
+                matrix_expression<const matrix_base<double, SizeT>, const matrix_base<double, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef double        value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+//
+/////////////////////////   unary element-wise operations    /////////////////////////////////
+//
+
+// Note: Due to interference between CUDA and the C preprocessor (token concatenation appears to be broken in at least CUDA 4.2),
+//       we could not find a more 'automatic' way of generating the overloads below...
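+//
+// For orientation, one way such generation might have looked (hypothetical macro name, shown only
+// as a sketch of the token pasting that nvcc rejected; the hand-written overloads below are the
+// actual implementation):
+//
+//   #define VIENNACL_CUDA_UNARY_ELEMENT_OP(OPNAME)                                    \
+//   template<typename NumericT>                                                       \
+//   void element_op(matrix_base<NumericT> & A,                                        \
+//                   matrix_expression<const matrix_base<NumericT>,                    \
+//                                     const matrix_base<NumericT>,                    \
+//                                     op_element_unary<op_##OPNAME> > const & proxy)  \
+//   { /* launch matrix_row_element_##OPNAME##_kernel or the _col_ variant */ }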
+
+// abs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
+  }
+  else
+  {
+    matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
+  }
+}
+
+
+// acos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
+  }
+  else
+  {
+    matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
+  }
+}
+
+
+// asin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
+  }
+  else
+  {
+    matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+  }
+}
+
+
+// atan
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
+  }
+  else
+  {
+    matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
+  }
+}
+
+
+// ceil
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
+  }
+  else
+  {
+    matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
+  }
+}
+
+
+// cos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
+  }
+  else
+  {
+    matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
+  }
+}
+
+
+// cosh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
+  }
+  else
+  {
+    matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
+  }
+}
+
+
+// exp
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
+  }
+  else
+  {
+    matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
+  }
+}
+
+
+// fabs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
+  }
+  else
+  {
+    matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
+  }
+}
+
+
+// floor
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
+  }
+  else
+  {
+    matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
+  }
+}
+
+
+// log
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
+  }
+  else
+  {
+    matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
+  }
+}
+
+
+// log10
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
+  }
+  else
+  {
+    matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
+  }
+}
+
+
+// sin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
+  }
+  else
+  {
+    matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+  }
+}
+
+
+// sinh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
+  }
+  else
+  {
+    matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
+  }
+}
+
+
+// sqrt
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned i

<TRUNCATED>

[34/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
new file mode 100644
index 0000000..44684ce
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
@@ -0,0 +1,1847 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_col.hpp
+    @brief Implementations of column-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
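+// Conventions shared by the column-major kernels in this file: every matrix
+// argument is described by its offsets (X_start1/X_start2), strides
+// (X_inc1/X_inc2), logical sizes (X_size1/X_size2) and padded storage sizes
+// (X_internal_size1/X_internal_size2). Element (row, col) of a column-major
+// matrix X is therefore addressed as
+//   X[(row * X_inc1 + X_start1) + (col * X_inc2 + X_start2) * X_internal_size1].
+// The kernels walk columns with a stride of gridDim.x and rows with a stride of
+// blockDim.x, so the host-side wrappers control the work distribution purely
+// through the launch configuration.
+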
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_col_kernel(DestNumericT * A,
+                                  unsigned int A_start1, unsigned int A_start2,
+                                  unsigned int A_inc1,   unsigned int A_inc2,
+                                  unsigned int A_size1,  unsigned int A_size2,
+                                  unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                                  const SrcNumericT * B,
+                                  unsigned int B_start1, unsigned int B_start2,
+                                  unsigned int B_inc1,   unsigned int B_inc2,
+                                  unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
+}
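+
+// Illustrative host-side sketch of launching the conversion kernel above for a
+// full (non-strided) matrix; hypothetical example, not part of this header --
+// the real launch wrappers follow the element_op() pattern shown earlier in
+// this commit:
+//
+//   viennacl::matrix<double, viennacl::column_major> src(32, 32);
+//   viennacl::matrix<float,  viennacl::column_major> dst(32, 32);
+//   convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(dst),
+//       0, 0, 1, 1, 32, 32,
+//       static_cast<unsigned int>(dst.internal_size1()),
+//       static_cast<unsigned int>(dst.internal_size2()),
+//       viennacl::cuda_arg(src),
+//       0, 0, 1, 1,
+//       static_cast<unsigned int>(src.internal_size1()),
+//       static_cast<unsigned int>(src.internal_size2()));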
+
+//
+// am
+//
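+
+// The 'options2' bitmask (and 'options3' further below) encodes how a scalar is
+// applied, as can be read off the kernel bodies:
+//   bit 0 set -> negate the scalar before use,
+//   bit 1 set -> divide by the scalar instead of multiplying.
+// For example, options2 == 3 turns A = alpha * B into A = B / (-alpha). Each
+// kernel comes in two overloads so the scalar can live either on the host
+// (passed by value) or on the device (passed as a pointer).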
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void am_col_kernel(NumericT * A,
+                              unsigned int A_start1, unsigned int A_start2,
+                              unsigned int A_inc1,   unsigned int A_inc2,
+                              unsigned int A_size1,  unsigned int A_size2,
+                              unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * B,
+                              unsigned int B_start1, unsigned int B_start2,
+                              unsigned int B_inc1,   unsigned int B_inc2,
+                              unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+  }
+  else
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+  }
+}
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void am_col_kernel(NumericT * A,
+                              unsigned int A_start1, unsigned int A_start2,
+                              unsigned int A_inc1,   unsigned int A_inc2,
+                              unsigned int A_size1,  unsigned int A_size2,
+                              unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * B,
+                              unsigned int B_start1, unsigned int B_start2,
+                              unsigned int B_inc1,   unsigned int B_inc2,
+                              unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+  }
+  else
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+  }
+}
+
+
+//
+// ambm
+//
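+
+// ambm: element-wise A = alpha * B + beta * C on column-major matrices. The four
+// overloads below cover the host/device combinations for alpha and beta; the
+// options2/options3 bitmasks carry the negate/divide flags for alpha and beta
+// respectively (see the note above the am kernels).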
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+                                unsigned int A_start1, unsigned int A_start2,
+                                unsigned int A_inc1,   unsigned int A_inc2,
+                                unsigned int A_size1,  unsigned int A_size2,
+                                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                                NumericT fac2,
+                                unsigned int options2,
+                                const NumericT * B,
+                                unsigned int B_start1, unsigned int B_start2,
+                                unsigned int B_inc1,   unsigned int B_inc2,
+                                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                                NumericT fac3,
+                                unsigned int options3,
+                                const NumericT * C,
+                                unsigned int C_start1, unsigned int C_start2,
+                                unsigned int C_inc1,   unsigned int C_inc2,
+                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+                                unsigned int A_start1, unsigned int A_start2,
+                                unsigned int A_inc1,   unsigned int A_inc2,
+                                unsigned int A_size1,  unsigned int A_size2,
+                                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                                NumericT fac2,
+                                unsigned int options2,
+                                const NumericT * B,
+                                unsigned int B_start1, unsigned int B_start2,
+                                unsigned int B_inc1,   unsigned int B_inc2,
+                                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                                const NumericT * fac3,
+                                unsigned int options3,
+                                const NumericT * C,
+                                unsigned int C_start1, unsigned int C_start2,
+                                unsigned int C_inc1,   unsigned int C_inc2,
+                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+                                unsigned int A_start1, unsigned int A_start2,
+                                unsigned int A_inc1,   unsigned int A_inc2,
+                                unsigned int A_size1,  unsigned int A_size2,
+                                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                                const NumericT * fac2,
+                                unsigned int options2,
+                                const NumericT * B,
+                                unsigned int B_start1, unsigned int B_start2,
+                                unsigned int B_inc1,   unsigned int B_inc2,
+                                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                                NumericT fac3,
+                                unsigned int options3,
+                                const NumericT * C,
+                                unsigned int C_start1, unsigned int C_start2,
+                                unsigned int C_inc1,   unsigned int C_inc2,
+                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+
+//
+// ambm_m
+//
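+
+// ambm_m: the accumulating variant, element-wise A += alpha * B + beta * C, with
+// the same host/device overload scheme and option bits as ambm above.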
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+    }
+    else
+    {
+      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+        = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+    }
+  }
+}
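
Editor's note: the three overloads above differ only in whether alpha and beta arrive by value (host scalar) or by pointer (device scalar). In every case, bit 0 of options2/options3 flips the sign of the corresponding factor and bit 1 selects division instead of multiplication by it. A minimal scalar sketch of that decoding, usable for checking a single entry on the host; the helper names below are illustrative and not part of ViennaCL:

    // Decode the same option bits the kernels above use:
    //   bit 0 -> flip the sign of the factor
    //   bit 1 -> divide by the factor instead of multiplying
    template<typename NumericT>
    NumericT apply_factor(NumericT value, NumericT factor, unsigned int options)
    {
      if (options & (1u << 0))
        factor = -factor;
      return (options & (1u << 1)) ? value / factor
                                   : value * factor;
    }

    // Reference for one entry of the update formed by the kernels above:
    // combine B(i,j) scaled by alpha with C(i,j) scaled by beta.
    template<typename NumericT>
    NumericT ambm_entry(NumericT b, NumericT alpha, unsigned int options2,
                        NumericT c, NumericT beta,  unsigned int options3)
    {
      return apply_factor(b, alpha, options2) + apply_factor(c, beta, options3);
    }
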
+
+
+
+//
+// assignments
+//
+
+template<typename NumericT>
+__global__ void matrix_col_assign_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT alpha)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
+}
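
Editor's note: a minimal host-side sketch of launching this kernel for a dense, unstrided column-major matrix (all starts 0, increments 1, internal sizes equal to the logical sizes). It assumes the file is compiled with nvcc, that VIENNACL_WITH_CUDA is defined, and that viennacl/linalg/cuda/matrix_operations.hpp transitively provides the column-major kernels in this diff; grid and block sizes are illustrative only, and error checking is omitted:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>
    #include "viennacl/linalg/cuda/matrix_operations.hpp"  // assumed to pull in these kernels

    int main()
    {
      unsigned int rows = 64, cols = 32;
      double *d_A = 0;
      cudaMalloc(&d_A, sizeof(double) * rows * cols);

      // Fill every entry of the column-major matrix A with 3.5.
      viennacl::linalg::cuda::matrix_col_assign_kernel<<<128, 128>>>(
          d_A,
          0u, 0u,       // A_start1, A_start2
          1u, 1u,       // A_inc1,   A_inc2
          rows, cols,   // A_size1,  A_size2
          rows, cols,   // A_internal_size1, A_internal_size2
          3.5);

      std::vector<double> h_A(rows * cols);
      cudaMemcpy(h_A.data(), d_A, sizeof(double) * rows * cols, cudaMemcpyDeviceToHost);
      std::printf("A(0,0) = %f\n", h_A[0]);
      cudaFree(d_A);
      return 0;
    }
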
+
+
+template<typename NumericT>
+__global__ void matrix_col_diagonal_assign_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT alpha)
+{
+  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
+    A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
+}
+
+//
+// binary element-wise operations
+//
+
+template<typename NumericT>
+__global__ void element_op_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+          unsigned int op_type) //0: product, 1: division, 2: pow
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (op_type == 2)
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+      = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
+            C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
+  }
+  else if (op_type == 1)
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+      / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+      * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+  }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+          unsigned int op_type) //0: product, 1: division (op_type 2, pow, is not handled for integer types)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+  if (op_type == 1)
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+      / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+      * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+  }
+}
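
Editor's note: both element-wise kernels dispatch on the same op_type encoding; the integer variant has no pow branch, presumably because the device pow routines are floating-point. A scalar reference of the dispatch, as a hypothetical helper that is not part of the library:

    #include <cmath>

    template<typename NumericT>
    NumericT element_op_reference(NumericT b, NumericT c, unsigned int op_type)
    {
      if (op_type == 2)
        return std::pow(b, c);   // floating-point types only
      if (op_type == 1)
        return b / c;
      return b * c;              // op_type == 0
    }
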
+
+
+//
+// unary element-wise operations
+//
+
+// abs
+template<typename NumericT>
+__global__ void matrix_col_element_abs_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// acos
+template<typename NumericT>
+__global__ void matrix_col_element_acos_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// asin
+template<typename NumericT>
+__global__ void matrix_col_element_asin_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void matrix_col_element_atan_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void matrix_col_element_ceil_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void matrix_col_element_cos_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void matrix_col_element_cosh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void matrix_col_element_exp_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void matrix_col_element_fabs_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// floor
+template<typename NumericT>
+__global__ void matrix_col_element_floor_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// log
+template<typename NumericT>
+__global__ void matrix_col_element_log_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void matrix_col_element_log10_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void matrix_col_element_sin_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void matrix_col_element_sinh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void matrix_col_element_sqrt_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void matrix_col_element_tan_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void matrix_col_element_tanh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+
+//
+// matrix-vector product
+//
+
+template<typename NumericT>
+__global__ void vec_mul_col_kernel(
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+          NumericT * result,
+          unsigned int result_start,
+          unsigned int result_inc,
+          unsigned int result_size)
+{
+
+  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
+  {
+    NumericT dot_prod = 0;
+    for (unsigned int col = 0; col < A_col_size; ++col)
+      dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
+    result[row * result_inc + result_start] = dot_prod;
+  }
+}
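
Editor's note: vec_mul_col_kernel assigns one thread per row of the (sub)matrix and forms the dot product of that row with v, using the column-major addressing (row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows. A plain CPU reference with the same addressing, useful for validating results; the helper name is hypothetical:

    template<typename NumericT>
    void vec_mul_col_reference(const NumericT *A,
                               unsigned int A_row_start, unsigned int A_col_start,
                               unsigned int A_row_inc,   unsigned int A_col_inc,
                               unsigned int A_row_size,  unsigned int A_col_size,
                               unsigned int A_internal_rows,
                               const NumericT *v, unsigned int v_start, unsigned int v_inc,
                               NumericT *result, unsigned int result_start, unsigned int result_inc)
    {
      for (unsigned int row = 0; row < A_row_size; ++row)
      {
        NumericT dot_prod = 0;
        for (unsigned int col = 0; col < A_col_size; ++col)
          dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows]
                      * v[v_start + v_inc * col];
        result[row * result_inc + result_start] = dot_prod;   // result = A * v
      }
    }
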
+
+
+template<typename NumericT>
+__global__ void trans_vec_mul_col_kernel(
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+          NumericT * result,
+          unsigned int result_start,
+          unsigned int result_inc,
+          unsigned int result_size)
+{
+  __shared__ NumericT work[128];
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int lid = threadIdx.x;
+
+  for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
+  {
+    NumericT dot_prod = 0;
+    for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
+      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
+    work[lid] = dot_prod;
+
+    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
+      __syncthreads();
+      if (lid < stride)
+        work[lid] += work[lid+stride];
+    }
+
+    if (lid == 0)
+      result[row * result_inc + result_start] = work[0];
+  }
+}
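
Editor's note: the transposed product assigns one block per output entry; each thread accumulates a partial dot product, the partials are combined by the shared-memory tree reduction, and thread 0 writes the result. The reduction halves the active range each step, so it assumes blockDim.x is a power of two and no larger than the 128-entry work buffer. A plain C++ sketch of that halving pattern over one block's partial sums, illustrative only:

    #include <cstddef>
    #include <vector>

    template<typename NumericT>
    NumericT block_reduce_reference(std::vector<NumericT> work)   // size: power of two
    {
      for (std::size_t stride = work.size() / 2; stride > 0; stride >>= 1)
        for (std::size_t lid = 0; lid < stride; ++lid)   // every "thread" below stride
          work[lid] += work[lid + stride];
      return work[0];
    }
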
+
+
+//
+// matrix-matrix products
+//
+
+
+
+
+//
+// scaled rank-1-update
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT val,
+          unsigned int options2,
+
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+
+          const NumericT * vec2,
+          unsigned int start2,
+          unsigned int inc2,
+          unsigned int size2)
+{
+  NumericT alpha = val;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+  {
+    NumericT tmp = alpha * vec1[row * inc1 + start1];
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
+  }
+}
+
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_col_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * val,
+          unsigned int options2,
+
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+
+          const NumericT * vec2,
+          unsigned int start2,
+          unsigned int inc2,
+          unsigned int size2)
+{
+  NumericT alpha = *val;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+  {
+    NumericT tmp = alpha * vec1[row * inc1 + start1];
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
+  }
+}
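
Editor's note: both overloads perform the scaled rank-1 update A += alpha * vec1 * vec2^T, decoding options2 as above (bit 0 negates alpha, bit 1 replaces it by its reciprocal). A CPU reference for the dense, unstrided column-major case; the helper name is hypothetical:

    template<typename NumericT>
    void rank1_update_reference(NumericT *A, unsigned int rows, unsigned int cols,
                                NumericT alpha, unsigned int options2,
                                const NumericT *vec1, const NumericT *vec2)
    {
      if (options2 & (1u << 0)) alpha = -alpha;               // bit 0: sign flip
      if (options2 & (1u << 1)) alpha = NumericT(1) / alpha;  // bit 1: reciprocal

      for (unsigned int row = 0; row < rows; ++row)
        for (unsigned int col = 0; col < cols; ++col)
          A[row + col * rows] += alpha * vec1[row] * vec2[col];  // column-major A
    }
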
+
+
+template <typename T>
+__global__ void bidiag_pack_row_major_kernel(
+            T * A,
+            T * D,
+            T * S,
+            unsigned int size1,
+            unsigned int size2,
+            unsigned int stride)
+{
+  unsigned int size = min(size1, size2);
+  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
+    S[0] = 0;
+
+  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+           i < size;
+           i += gridDim.x * blockDim.x)
+    {
+      D[i] = A[i*stride + i];
+      S[i+1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0;
+    }
+}
+
+template <typename T>
+__global__ void bidiag_pack_column_major_kernel(
+            T * A,
+            T * D,
+            T * S,
+            unsigned int size1,
+            unsigned int size2,
+            unsigned int stride)
+{
+  unsigned int size = min(size1, size2);
+  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
+    S[0] = 0;
+
+  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+           i < size;
+           i += gridDim.x * blockDim.x)
+    {
+      D[i] = A[i*stride + i];
+      S[i+1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;
+    }
+}
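
Editor's note: bidiag_pack copies the main diagonal of A into D and its superdiagonal, shifted by one position, into S (with S[0] = 0); the two variants differ only in how A(i, i+1) is addressed. A CPU reference for the column-major layout with leading dimension `stride`; the helper name is hypothetical:

    #include <algorithm>

    template<typename T>
    void bidiag_pack_reference(const T *A, T *D, T *S,
                               unsigned int size1, unsigned int size2, unsigned int stride)
    {
      unsigned int size = std::min(size1, size2);
      S[0] = 0;
      for (unsigned int i = 0; i < size; ++i)
      {
        D[i]     = A[i + i * stride];                              // A(i, i)
        S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;  // A(i, i+1)
      }
    }
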
+
+
+
+template<typename T>
+__global__ void copy_col_row_major_kernel(
+        T * A,
+        T * V,
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size,
+        unsigned int stride)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int sz = gridDim.x * blockDim.x;
+
+    for(unsigned int i = row_start + x; i < size; i += sz)
+    {
+        V[i - row_start] = A[i * stride + col_start];
+    }
+}
+
+template<typename T>
+__global__ void copy_col_column_major_kernel(
+        T * A,
+        T * V,
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size,
+        unsigned int stride)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int sz = gridDim.x * blockDim.x;
+
+    for(unsigned int i = row_start + x; i < size; i += sz)
+    {
+        V[i - row_start] = A[i + col_start * stride];
+    }
+}
+
+template<typename T>
+__global__ void copy_row_row_major_kernel(
+        T * A,
+        T * V,
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size,
+        unsigned int stride)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int sz = gridDim.x * blockDim.x;
+
+    for(unsigned int i = col_start + x; i < size; i += sz)
+    {
+        V[i - col_start] = A[row_start * stride + i];
+    }
+
+}
+
+template<typename T>
+__global__ void copy_row_column_major_kernel(
+        T * A,
+        T * V,
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size,
+        unsigned int stride)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int sz = gridDim.x * blockDim.x;
+
+    for(unsigned int i = col_start + x; i < size; i += sz)
+    {
+        V[i - col_start] = A[row_start + i * stride];
+    }
+
+}
+
+
+
+template<typename T>
+__global__ void house_update_A_left_row_major_kernel(
+        T * A,
+        T * V,        //householder vector
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size1,
+        unsigned int size2,
+        unsigned int stride)
+{
+    T ss = 0;
+
+    for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
+        i < size2;
+        i += gridDim.x * blockDim.x)
+    {
+        ss = 0;
+        for(unsigned int j = row_start; j < size1; j++)
+            ss = ss +(V[j] * A[j * stride + i]);
+
+        for(unsigned int j = row_start; j < size1; j++)
+            A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
+    }
+}
+
+template<typename T>
+__global__ void house_update_A_left_column_major_kernel(
+        T * A,
+        T * V,        //householder vector
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size1,
+        unsigned int size2,
+        unsigned int stride)
+{
+    T ss = 0;
+
+    for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
+        i < size2;
+        i += gridDim.x * blockDim.x)
+    {
+        ss = 0;
+        for(unsigned int j = row_start; j < size1; j++)
+            ss = ss +(V[j] * A[j + i * stride]);
+
+        for(unsigned int j = row_start; j < size1; j++)
+            A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
+    }
+}
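
Editor's note: the house_update_A_left kernels apply the reflector I - 2 v v^T from the left to the trailing block of A starting at (row_start, col_start): for each column they form ss = v^T A(:, i) and then subtract 2 v ss. Written this way the update equals the Householder reflection only for a unit-norm v, so the caller presumably passes a normalized vector. A CPU reference for the column-major variant; the helper name is hypothetical:

    template<typename T>
    void house_update_A_left_reference(T *A, const T *V,
                                       unsigned int row_start, unsigned int col_start,
                                       unsigned int size1, unsigned int size2,
                                       unsigned int stride)
    {
      for (unsigned int i = col_start; i < size2; ++i)        // one column at a time
      {
        T ss = 0;
        for (unsigned int j = row_start; j < size1; ++j)      // ss = v^T * A(:, i)
          ss += V[j] * A[j + i * stride];
        for (unsigned int j = row_start; j < size1; ++j)      // A(:, i) -= 2 * v * ss
          A[j + i * stride] -= 2 * V[j] * ss;
      }
    }
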
+
+
+
+template<typename T>
+__global__ void house_update_A_right_row_major_kernel(
+        T * A,
+        T * V,  //householder vector
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size1,
+        unsigned int size2,
+        unsigned int stride)
+{
+    __shared__ T sums[128];
+    T ss = 0;
+
+    for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
+    {
+        ss = 0;
+        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+            ss = ss + (V[j] * A[i * stride + j]);
+        sums[threadIdx.x] = ss;
+
+        __syncthreads();
+        col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+        __syncthreads();
+
+        T sum_Av = sums[0];
+
+        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+            A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
+    }
+}
+
+template<typename T>
+__global__ void house_update_A_right_column_major_kernel(
+        T * A,
+        T * V,  //householder vector
+        unsigned int row_start,
+        unsigned int col_start,
+        unsigned int size1,
+        unsigned int size2,
+        unsigned int stride)
+{
+    __shared__ T sums[128];
+    T ss = 0;
+
+    for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
+    {
+        ss = 0;
+        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+            ss = ss + (V[j] * A[i + j * stride]);
+        sums[threadIdx.x] = ss;
+
+        __syncthreads();
+        col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+        __syncthreads();
+
+        T sum_Av = sums[0];
+
+        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+            A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av);
+    }
+}
+
+
+
+template<typename T>
+__device__ void col_reduce_lcl_array(
+        T * sums,
+        unsigned int th_Idx,
+        unsigned int bl_Dim)
+{
+    unsigned int step = bl_Dim >> 1;
+
+    while(step > 0)
+    {
+        if(th_Idx < step)
+            sums[th_Idx] += sums[th_Idx + step];
+        step >>= 1;
+        __syncthreads();
+    }
+}
+
+
+template <typename T>
+__global__ void house_update_QL_row_major_kernel(
+        T * QL,
+        T * V,
+        unsigned int size1,
+        unsigned int strideQ)
+{
+  __shared__ T sums[128];
+  T ss = 0;
+  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
+  {
+    ss = 0;
+    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+      ss = ss + (V[j] * QL[i * strideQ + j]);
+    sums[threadIdx.x] = ss;
+
+    __syncthreads();
+    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+    __syncthreads();
+
+    T sum_Qv = sums[0];
+
+    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+      QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
+  }
+}
+
+template <typename T>
+__global__ void house_update_QL_column_major_kernel(
+        T * QL,
+        T * V,
+        unsigned int size1,
+        unsigned int strideQ)
+{
+  __shared__ T sums[128];
+  T ss = 0;
+  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
+  {
+    ss = 0;
+    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+      ss = ss + (V[j] * QL[i + j * strideQ]);
+    sums[threadIdx.x] = ss;
+
+    __syncthreads();
+    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+    __syncthreads();
+
+    T sum_Qv = sums[0];
+
+    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+      QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
+  }
+}
+
+
+template <typename T>
+__global__ void givens_next_row_major_kernel(
+        T * matr,
+        T * cs,
+        T * ss,
+        unsigned int size,
+        unsigned int stride,
+        unsigned int start_i,
+        unsigned int end_i)
+{
+    unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
+    __shared__ T cs_lcl[256];
+    __shared__ T ss_lcl[256];
+
+    T x = (j < size) ? matr[(end_i + 1) + j * stride] : 0;
+
+    unsigned int elems_num = end_i - start_i + 1;
+    unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
+
+    for(unsigned int block_id = 0; block_id < block_num; block_id++)
+    {
+        unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
+
+        if(threadIdx.x < to)
+        {
+            cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
+            ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
+        }
+        __syncthreads();
+        if(j < size)
+        {
+            for(unsigned int ind = 0; ind < to; ind++)
+            {
+                unsigned int i = end_i - (ind + block_id * blockDim.x);
+                T z = matr[i + j * stride];
+                T cs_val = cs_lcl[ind];
+                T ss_val = ss_lcl[ind];
+                matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
+                x = -x * ss_val + z * cs_val;
+            }
+        }
+        __syncthreads();
+     }
+     if(j < size)
+       matr[(start_i) + j * stride] = x;
+}
+
+template <typename T>
+__global__ void givens_next_column_major_kernel(
+        T * matr,
+        T * cs,
+        T * ss,
+        unsigned int size,
+        unsigned int stride,
+        unsigned int start_i,
+        unsigned int end_i)
+{
+    unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
+    __shared__ T cs_lcl[256];
+    __shared__ T ss_lcl[256];
+
+    T x = (j < size) ? matr[(end_i + 1) *stride + j] : 0;
+
+    unsigned int elems_num = end_i - start_i + 1;
+    unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
+
+    for(unsigned int block_id = 0; block_id < block_num; block_id++)
+    {
+        unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
+
+        if(threadIdx.x < to)
+        {
+            cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
+            ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
+        }
+        __syncthreads();
+        if(j < size)
+        {
+            for(unsigned int ind = 0; ind < to; ind++)
+            {
+                unsigned int i = end_i - (ind + block_id * blockDim.x);
+                T z = matr[i *stride + j];
+                T cs_val = cs_lcl[ind];
+                T ss_val = ss_lcl[ind];
+                matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
+                x = -x * ss_val + z * cs_val;
+            }
+        }
+        __syncthreads();
+     }
+     if(j < size)
+       matr[(start_i) * stride + j] = x;
+}
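
Editor's note: givens_next applies a chain of Givens rotations. For each j < size and for i running from end_i down to start_i, the pair of entries matr[i * stride + j] and matr[(i + 1) * stride + j] (column-major variant) is rotated with (cs[i], ss[i]); the shared-memory tiles merely cache cs/ss blockwise. A sequential CPU reference with the same indexing; the helper name is hypothetical:

    template<typename T>
    void givens_next_reference(T *matr, const T *cs, const T *ss,
                               unsigned int size, unsigned int stride,
                               unsigned int start_i, unsigned int end_i)
    {
      for (unsigned int j = 0; j < size; ++j)            // each j handled independently
      {
        T x = matr[(end_i + 1) * stride + j];
        for (unsigned int i = end_i; ; --i)
        {
          T z = matr[i * stride + j];
          matr[(i + 1) * stride + j] = x * cs[i] + z * ss[i];
          x = -x * ss[i] + z * cs[i];
          if (i == start_i) break;                       // avoid unsigned wrap-around
        }
        matr[start_i * stride + j] = x;
      }
    }
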
+
+
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


[30/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
new file mode 100644
index 0000000..24bcf96
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
@@ -0,0 +1,761 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
+    @brief Implementations of direct triangular solvers for sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Compressed matrix
+//
+
+//
+// non-transposed
+//
+
+template<typename NumericT>
+__global__ void csr_unit_lu_forward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int col_index_buffer[128];
+  __shared__  NumericT element_buffer[128];
+  __shared__  NumericT vector_buffer[128];
+
+  unsigned int nnz = row_indices[size];
+  unsigned int current_row = 0;
+  unsigned int row_at_window_start = 0;
+  NumericT current_vector_entry = vector[0];
+  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
+  unsigned int next_row = row_indices[1];
+
+  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+  {
+    //load into shared memory (coalesced access):
+    if (i < nnz)
+    {
+      element_buffer[threadIdx.x] = elements[i];
+      unsigned int tmp = column_indices[i];
+      col_index_buffer[threadIdx.x] = tmp;
+      vector_buffer[threadIdx.x] = vector[tmp];
+    }
+
+    __syncthreads();
+
+    //now a single thread does the remaining work in shared memory:
+    if (threadIdx.x == 0)
+    {
+      // traverse through all the loaded data:
+      for (unsigned int k=0; k<blockDim.x; ++k)
+      {
+        if (current_row < size && i+k == next_row) //current row is finished. Write back result
+        {
+          vector[current_row] = current_vector_entry;
+          ++current_row;
+          if (current_row < size) //load next row's data
+          {
+            next_row = row_indices[current_row+1];
+            current_vector_entry = vector[current_row];
+          }
+        }
+
+        if (current_row < size && col_index_buffer[k] < current_row) //substitute
+        {
+          if (col_index_buffer[k] < row_at_window_start) //use recently computed results
+            current_vector_entry -= element_buffer[k] * vector_buffer[k];
+          else if (col_index_buffer[k] < current_row) //use buffered data
+            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+        }
+
+      } // for k
+
+      row_at_window_start = current_row;
+    } // if (get_local_id(0) == 0)
+
+    __syncthreads();
+  } //for i
+}
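
Editor's note: the kernel above is written to be launched with a single block (all indexing uses threadIdx.x only); the threads stage a window of nonzeros in shared memory and thread 0 performs the actual unit lower-triangular forward substitution in row order. A sequential CPU reference of the substitution itself; the unit diagonal is not stored, so no division occurs, and the helper name is hypothetical:

    template<typename NumericT>
    void csr_unit_lu_forward_reference(const unsigned int *row_indices,
                                       const unsigned int *column_indices,
                                       const NumericT *elements,
                                       NumericT *vector, unsigned int size)
    {
      for (unsigned int row = 0; row < size; ++row)
      {
        NumericT value = vector[row];
        for (unsigned int k = row_indices[row]; k < row_indices[row + 1]; ++k)
          if (column_indices[k] < row)                 // strictly lower part only
            value -= elements[k] * vector[column_indices[k]];
        vector[row] = value;                           // unit diagonal: no division
      }
    }
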
+
+
+
+template<typename NumericT>
+__global__ void csr_lu_forward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int col_index_buffer[128];
+  __shared__  NumericT element_buffer[128];
+  __shared__  NumericT vector_buffer[128];
+
+  unsigned int nnz = row_indices[size];
+  unsigned int current_row = 0;
+  unsigned int row_at_window_start = 0;
+  NumericT current_vector_entry = vector[0];
+  NumericT diagonal_entry = 0;
+  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
+  unsigned int next_row = row_indices[1];
+
+  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+  {
+    //load into shared memory (coalesced access):
+    if (i < nnz)
+    {
+      element_buffer[threadIdx.x] = elements[i];
+      unsigned int tmp = column_indices[i];
+      col_index_buffer[threadIdx.x] = tmp;
+      vector_buffer[threadIdx.x] = vector[tmp];
+    }
+
+    __syncthreads();
+
+    //now a single thread does the remaining work in shared memory:
+    if (threadIdx.x == 0)
+    {
+      // traverse through all the loaded data:
+      for (unsigned int k=0; k<blockDim.x; ++k)
+      {
+        if (current_row < size && i+k == next_row) //current row is finished. Write back result
+        {
+          vector[current_row] = current_vector_entry / diagonal_entry;
+          ++current_row;
+          if (current_row < size) //load next row's data
+          {
+            next_row = row_indices[current_row+1];
+            current_vector_entry = vector[current_row];
+          }
+        }
+
+        if (current_row < size && col_index_buffer[k] < current_row) //substitute
+        {
+          if (col_index_buffer[k] < row_at_window_start) //use recently computed results
+            current_vector_entry -= element_buffer[k] * vector_buffer[k];
+          else if (col_index_buffer[k] < current_row) //use buffered data
+            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+        }
+        else if (col_index_buffer[k] == current_row)
+          diagonal_entry = element_buffer[k];
+
+      } // for k
+
+      row_at_window_start = current_row;
+    } // if (get_local_id(0) == 0)
+
+    __syncthreads();
+  } //for i
+}
+
+
+template<typename NumericT>
+__global__ void csr_unit_lu_backward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int col_index_buffer[128];
+  __shared__  NumericT element_buffer[128];
+  __shared__  NumericT vector_buffer[128];
+
+  unsigned int nnz = row_indices[size];
+  unsigned int current_row = size-1;
+  unsigned int row_at_window_start = size-1;
+  NumericT current_vector_entry = vector[size-1];
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
+  unsigned int next_row = row_indices[size-1];
+
+  unsigned int i = loop_end + threadIdx.x;
+  while (1)
+  {
+    //load into shared memory (coalesced access):
+    if (i < nnz)
+    {
+      element_buffer[threadIdx.x] = elements[i];
+      unsigned int tmp = column_indices[i];
+      col_index_buffer[threadIdx.x] = tmp;
+      vector_buffer[threadIdx.x] = vector[tmp];
+    }
+
+    __syncthreads();
+
+    //now a single thread does the remaining work in shared memory:
+    if (threadIdx.x == 0)
+    {
+      // traverse through all the loaded data from back to front:
+      for (unsigned int k2=0; k2<blockDim.x; ++k2)
+      {
+        unsigned int k = (blockDim.x - k2) - 1;
+
+        if (i+k >= nnz)
+          continue;
+
+        if (col_index_buffer[k] > row_at_window_start) //use recently computed results
+          current_vector_entry -= element_buffer[k] * vector_buffer[k];
+        else if (col_index_buffer[k] > current_row) //use buffered data
+          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+
+        if (i+k == next_row) //current row is finished. Write back result
+        {
+          vector[current_row] = current_vector_entry;
+          if (current_row > 0) //load next row's data
+          {
+            --current_row;
+            next_row = row_indices[current_row];
+            current_vector_entry = vector[current_row];
+          }
+        }
+
+
+      } // for k
+
+      row_at_window_start = current_row;
+    } // if (get_local_id(0) == 0)
+
+    __syncthreads();
+
+    if (i < blockDim.x)
+      break;
+
+    i -= blockDim.x;
+  } //for i
+}
+
+
+
+template<typename NumericT>
+__global__ void csr_lu_backward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int col_index_buffer[128];
+  __shared__  NumericT element_buffer[128];
+  __shared__  NumericT vector_buffer[128];
+
+  unsigned int nnz = row_indices[size];
+  unsigned int current_row = size-1;
+  unsigned int row_at_window_start = size-1;
+  NumericT current_vector_entry = vector[size-1];
+  NumericT diagonal_entry;
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
+  unsigned int next_row = row_indices[size-1];
+
+  unsigned int i = loop_end + threadIdx.x;
+  while (1)
+  {
+    //load into shared memory (coalesced access):
+    if (i < nnz)
+    {
+      element_buffer[threadIdx.x] = elements[i];
+      unsigned int tmp = column_indices[i];
+      col_index_buffer[threadIdx.x] = tmp;
+      vector_buffer[threadIdx.x] = vector[tmp];
+    }
+
+    __syncthreads();
+
+    //now a single thread does the remaining work in shared memory:
+    if (threadIdx.x == 0)
+    {
+      // traverse through all the loaded data from back to front:
+      for (unsigned int k2=0; k2<blockDim.x; ++k2)
+      {
+        unsigned int k = (blockDim.x - k2) - 1;
+
+        if (i+k >= nnz)
+          continue;
+
+        if (col_index_buffer[k] > row_at_window_start) //use recently computed results
+          current_vector_entry -= element_buffer[k] * vector_buffer[k];
+        else if (col_index_buffer[k] > current_row) //use buffered data
+          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+        else if (col_index_buffer[k] == current_row)
+          diagonal_entry = element_buffer[k];
+
+        if (i+k == next_row) //current row is finished. Write back result
+        {
+          vector[current_row] = current_vector_entry / diagonal_entry;
+          if (current_row > 0) //load next row's data
+          {
+            --current_row;
+            next_row = row_indices[current_row];
+            current_vector_entry = vector[current_row];
+          }
+        }
+
+
+      } // for k
+
+      row_at_window_start = current_row;
+    } // if (threadIdx.x == 0)
+
+    __syncthreads();
+
+    if (i < blockDim.x)
+      break;
+
+    i -= blockDim.x;
+  } //for i
+}
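+// Minimal launch sketch for the two backward-substitution kernels above (illustrative only; the
+// device pointer names are hypothetical, n denotes the matrix size). Both kernels index their
+// __shared__ buffers of length 128 with threadIdx.x and never use blockIdx, so they are meant to
+// run as a single block with at most 128 threads:
+//
+//   csr_lu_backward_kernel<<<1, 128>>>(d_row_ptr, d_col_idx, d_values, d_x, n);
+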
+
+
+
+//
+// transposed
+//
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_forward_kernel2(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  for (unsigned int row = 0; row < size; ++row)
+  {
+    NumericT result_entry = vector[row];
+
+    unsigned int row_start = row_indices[row];
+    unsigned int row_stop  = row_indices[row + 1];
+    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
+    {
+      unsigned int col_index = column_indices[entry_index];
+      if (col_index > row)
+        vector[col_index] -= result_entry * elements[entry_index];
+    }
+
+    __syncthreads();
+  }
+}
+
+template<typename NumericT>
+__global__ void csr_trans_unit_lu_forward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int row_index_lookahead[256];
+  __shared__  unsigned int row_index_buffer[256];
+
+  unsigned int row_index;
+  unsigned int col_index;
+  NumericT matrix_entry;
+  unsigned int nnz = row_indices[size];
+  unsigned int row_at_window_start = 0;
+  unsigned int row_at_window_end = 0;
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+  {
+    col_index    = (i < nnz) ? column_indices[i] : 0;
+    matrix_entry = (i < nnz) ? elements[i]       : 0;
+    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
+
+    __syncthreads();
+
+    if (i < nnz)
+    {
+      unsigned int row_index_inc = 0;
+      while (i >= row_index_lookahead[row_index_inc + 1])
+        ++row_index_inc;
+      row_index = row_at_window_start + row_index_inc;
+      row_index_buffer[threadIdx.x] = row_index;
+    }
+    else
+    {
+      row_index = size+1;
+      row_index_buffer[threadIdx.x] = size - 1;
+    }
+
+    __syncthreads();
+
+    row_at_window_start = row_index_buffer[0];
+    row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+    //forward elimination
+    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+    {
+      NumericT result_entry = vector[row];
+
+      if ( (row_index == row) && (col_index > row) )
+        vector[col_index] -= result_entry * matrix_entry;
+
+      __syncthreads();
+    }
+
+    row_at_window_start = row_at_window_end;
+  }
+
+}
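+// How the row lookahead above resolves rows (illustrative numbers): with row_indices = {0, 2, 5, 9}
+// and row_at_window_start = 0, the thread holding global entry i = 3 increments row_index_inc until
+// i < row_index_lookahead[row_index_inc + 1], i.e. it stops at row_index_inc = 1 because
+// row_indices[1] = 2 <= 3 < row_indices[2] = 5, and thus attributes its nonzero to row 1.
+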
+
+template<typename NumericT>
+__global__ void csr_trans_lu_forward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * diagonal_entries,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int row_index_lookahead[256];
+  __shared__  unsigned int row_index_buffer[256];
+
+  unsigned int row_index;
+  unsigned int col_index;
+  NumericT matrix_entry;
+  unsigned int nnz = row_indices[size];
+  unsigned int row_at_window_start = 0;
+  unsigned int row_at_window_end = 0;
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+  {
+    col_index    = (i < nnz) ? column_indices[i] : 0;
+    matrix_entry = (i < nnz) ? elements[i]       : 0;
+    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
+
+    __syncthreads();
+
+    if (i < nnz)
+    {
+      unsigned int row_index_inc = 0;
+      while (i >= row_index_lookahead[row_index_inc + 1])
+        ++row_index_inc;
+      row_index = row_at_window_start + row_index_inc;
+      row_index_buffer[threadIdx.x] = row_index;
+    }
+    else
+    {
+      row_index = size+1;
+      row_index_buffer[threadIdx.x] = size - 1;
+    }
+
+    __syncthreads();
+
+    row_at_window_start = row_index_buffer[0];
+    row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+    //forward elimination
+    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+    {
+      NumericT result_entry = vector[row] / diagonal_entries[row];
+
+      if ( (row_index == row) && (col_index > row) )
+        vector[col_index] -= result_entry * matrix_entry;
+
+      __syncthreads();
+    }
+
+    row_at_window_start = row_at_window_end;
+  }
+
+  // final step: Divide vector by diagonal entries:
+  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+    vector[i] /= diagonal_entries[i];
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_trans_unit_lu_backward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int row_index_lookahead[256];
+  __shared__  unsigned int row_index_buffer[256];
+
+  unsigned int row_index;
+  unsigned int col_index;
+  NumericT matrix_entry;
+  unsigned int nnz = row_indices[size];
+  unsigned int row_at_window_start = size;
+  unsigned int row_at_window_end;
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+  {
+    unsigned int i = (nnz - i2) - 1;
+    col_index    = (i2 < nnz) ? column_indices[i] : 0;
+    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
+    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+    __syncthreads();
+
+    if (i2 < nnz)
+    {
+      unsigned int row_index_dec = 0;
+      while (row_index_lookahead[row_index_dec] > i)
+        ++row_index_dec;
+      row_index = row_at_window_start - row_index_dec;
+      row_index_buffer[threadIdx.x] = row_index;
+    }
+    else
+    {
+      row_index = size+1;
+      row_index_buffer[threadIdx.x] = 0;
+    }
+
+    __syncthreads();
+
+    row_at_window_start = row_index_buffer[0];
+    row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+    //backward elimination
+    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+    {
+      unsigned int row = row_at_window_start - row2;
+      NumericT result_entry = vector[row];
+
+      if ( (row_index == row) && (col_index < row) )
+        vector[col_index] -= result_entry * matrix_entry;
+
+      __syncthreads();
+    }
+
+    row_at_window_start = row_at_window_end;
+  }
+
+}
+
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_backward_kernel2(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * diagonal_entries,
+                NumericT * vector,
+          unsigned int size)
+{
+  NumericT result_entry = 0;
+
+  //backward elimination, using U and D:
+  for (unsigned int row2 = 0; row2 < size; ++row2)
+  {
+    unsigned int row = (size - row2) - 1;
+    result_entry = vector[row] / diagonal_entries[row];
+
+    unsigned int row_start = row_indices[row];
+    unsigned int row_stop  = row_indices[row + 1];
+    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
+    {
+      unsigned int col_index = column_indices[entry_index];
+      if (col_index < row)
+        vector[col_index] -= result_entry * elements[entry_index];
+    }
+
+    __syncthreads();
+
+    if (threadIdx.x == 0)
+      vector[row] = result_entry;
+  }
+}
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_backward_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * diagonal_entries,
+                NumericT * vector,
+          unsigned int size)
+{
+  __shared__  unsigned int row_index_lookahead[256];
+  __shared__  unsigned int row_index_buffer[256];
+
+  unsigned int row_index;
+  unsigned int col_index;
+  NumericT matrix_entry;
+  unsigned int nnz = row_indices[size];
+  unsigned int row_at_window_start = size;
+  unsigned int row_at_window_end;
+  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+  {
+    unsigned int i = (nnz - i2) - 1;
+    col_index    = (i2 < nnz) ? column_indices[i] : 0;
+    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
+    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+    __syncthreads();
+
+    if (i2 < nnz)
+    {
+      unsigned int row_index_dec = 0;
+      while (row_index_lookahead[row_index_dec] > i)
+        ++row_index_dec;
+      row_index = row_at_window_start - row_index_dec;
+      row_index_buffer[threadIdx.x] = row_index;
+    }
+    else
+    {
+      row_index = size+1;
+      row_index_buffer[threadIdx.x] = 0;
+    }
+
+    __syncthreads();
+
+    row_at_window_start = row_index_buffer[0];
+    row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+    //backward elimination
+    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+    {
+      unsigned int row = row_at_window_start - row2;
+      NumericT result_entry = vector[row] / diagonal_entries[row];
+
+      if ( (row_index == row) && (col_index < row) )
+        vector[col_index] -= result_entry * matrix_entry;
+
+      __syncthreads();
+    }
+
+    row_at_window_start = row_at_window_end;
+  }
+
+
+  // final step: Divide vector by diagonal entries:
+  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+    vector[i] /= diagonal_entries[i];
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_block_trans_unit_lu_forward(
+          const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
+          const unsigned int * column_indices_L,
+          const NumericT * elements_L,
+          const unsigned int * block_offsets,
+          NumericT * result,
+          unsigned int size)
+{
+  unsigned int col_start = block_offsets[2*blockIdx.x];
+  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
+  unsigned int row_start = row_jumper_L[col_start];
+  unsigned int row_stop;
+  NumericT result_entry = 0;
+
+  if (col_start >= col_stop)
+    return;
+
+  //forward elimination, using L:
+  for (unsigned int col = col_start; col < col_stop; ++col)
+  {
+    result_entry = result[col];
+    row_stop = row_jumper_L[col + 1];
+    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
+    row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
+    __syncthreads();
+  }
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_block_trans_lu_backward(
+          const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
+          const unsigned int * column_indices_U,
+          const NumericT * elements_U,
+          const NumericT * diagonal_U,
+          const unsigned int * block_offsets,
+          NumericT * result,
+          unsigned int size)
+{
+  unsigned int col_start = block_offsets[2*blockIdx.x];
+  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
+  unsigned int row_start;
+  unsigned int row_stop;
+  NumericT result_entry = 0;
+
+  if (col_start >= col_stop)
+    return;
+
+  //backward elimination, using U and diagonal_U
+  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
+  {
+    unsigned int col = (col_stop - iter) - 1;
+    result_entry = result[col] / diagonal_U[col];
+    row_start = row_jumper_U[col];
+    row_stop  = row_jumper_U[col + 1];
+    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
+    __syncthreads();
+  }
+
+  //divide result vector by diagonal:
+  for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
+    result[col] /= diagonal_U[col];
+}
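+// Launch sketch for the two block-solve kernels above (illustrative; names are hypothetical): each
+// CUDA block processes one independent diagonal block given by a (start, stop) pair in block_offsets,
+// so the grid size matches the number of such blocks, e.g.
+//
+//   csr_block_trans_lu_backward<<<num_ilu_blocks, 128>>>(d_row_jumper_U, d_cols_U, d_vals_U,
+//                                                        d_diag_U, d_block_offsets, d_x, n);
+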
+
+
+
+//
+// Coordinate Matrix
+//
+
+
+
+
+//
+// ELL Matrix
+//
+
+
+
+//
+// Hybrid Matrix
+//
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
new file mode 100644
index 0000000..5551cda
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
@@ -0,0 +1,793 @@
+#ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_
+#define VIENNACL_LINALG_CUDA_SPGEMM_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/spgemm.hpp
+    @brief Implementation of sparse matrix-sparse matrix products (CSR times CSR) using CUDA
+*/
+
+#include <stdexcept>
+
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Loads a value from the specified address. On CUDA arch 3.5 and above the load goes through the read-only data cache (__ldg), so repeated reads of the same address can be served from cache */
+template<typename NumericT>
+static inline __device__ NumericT load_and_cache(const NumericT *address)
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg(address);
+#else
+  return *address;
+#endif
+}
+
+
+//
+// Stage 1: Obtain upper bound for number of elements per row in C:
+//
+template<typename IndexT>
+__device__ IndexT round_to_next_power_of_2(IndexT val)
+{
+  if (val > 32)
+    return 64; // just to indicate that we need to split/factor the matrix!
+  else if (val > 16)
+    return 32;
+  else if (val > 8)
+    return 16;
+  else if (val > 4)
+    return 8;
+  else if (val > 2)
+    return 4;
+  else if (val > 1)
+    return 2;
+  else
+    return 1;
+}
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          IndexT *subwarpsize_per_group,
+          IndexT *max_nnz_row_A_per_group,
+          IndexT *max_nnz_row_B_per_group)
+{
+  unsigned int subwarpsize_in_thread = 0;
+  unsigned int max_nnz_row_A = 0;
+  unsigned int max_nnz_row_B = 0;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
+  {
+    unsigned int A_row_start = A_row_indices[row];
+    unsigned int A_row_end   = A_row_indices[row+1];
+    unsigned int row_num = A_row_end - A_row_start;
+    if (row_num > 32) // too many rows in B need to be merged for a single pass
+    {
+      unsigned int subwarp_sqrt = (unsigned int)sqrt(double(row_num)) + 1;
+      subwarpsize_in_thread = max(subwarp_sqrt, subwarpsize_in_thread);
+    }
+    else
+      subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
+    max_nnz_row_A = max(max_nnz_row_A, row_num);
+    for (unsigned int j = A_row_start; j < A_row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
+      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
+    }
+  }
+
+  // reduction to obtain maximum in thread block
+  __shared__ unsigned int shared_subwarpsize[256];
+  __shared__ unsigned int shared_max_nnz_row_A[256];
+  __shared__ unsigned int shared_max_nnz_row_B[256];
+
+    shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread;
+  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
+  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+        shared_subwarpsize[threadIdx.x] = max(  shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
+      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
+      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
+    }
+  }
+
+  if (threadIdx.x == 0)
+  {
+      subwarpsize_per_group[blockIdx.x] = round_to_next_power_of_2(shared_subwarpsize[0]);
+    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
+    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
+  }
+}
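+// The reduction above is the standard power-of-two shared-memory pattern: every step halves the
+// number of active threads and keeps the pairwise maximum, so after log2(blockDim.x) steps entry 0
+// of each shared array holds the block-wide maximum (blockDim.x is assumed to be a power of two,
+// e.g. the 128 threads chosen by the host code further below).
+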
+
+//
+// Stage 2: Determine sparsity pattern of C
+//
+inline __device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize)
+{
+  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+  unsigned int num_nnz = 0;
+  while (1)
+  {
+    // determine current minimum (warp shuffle)
+    unsigned int min_index = current_front_index;
+    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+    if (min_index == B_size2)
+      break;
+
+    // update front:
+    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
+                                                             : current_front_index;
+    ++num_nnz;
+  }
+
+  return num_nnz;
+}
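+// Example of the symbolic merge above (illustrative, subwarp of size 2): if the two lanes start on
+// the sorted B-rows {1, 4, 7} and {2, 4, 9}, the __shfl_xor butterfly hands every lane the minimum
+// of the current front, the front advances on the lane(s) holding that minimum, and the distinct
+// columns 1, 2, 4, 7, 9 are counted, i.e. num_nnz = 5. A scalar reference computing the same count
+// could look as follows (hypothetical helper, not part of this file):
+//
+//   unsigned int symbolic_row_nnz(std::vector<std::vector<unsigned int> > const & B_rows)
+//   {
+//     std::set<unsigned int> cols;                       // requires <set> and <vector>
+//     for (std::size_t i = 0; i < B_rows.size(); ++i)
+//       cols.insert(B_rows[i].begin(), B_rows[i].end()); // union of the column indices
+//     return static_cast<unsigned int>(cols.size());
+//   }
+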
+
+inline __device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2,
+                                                             unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize)
+{
+  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+  unsigned int num_nnz = 0;
+  unsigned int index_buffer = 0;
+  unsigned int buffer_size = 0;
+  while (1)
+  {
+    // determine current minimum (warp shuffle)
+    unsigned int min_index = current_front_index;
+    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+    if (min_index == B_size2)
+      break;
+
+    // update front:
+    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
+                                                             : current_front_index;
+
+    // write output
+    index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
+    ++buffer_size;
+
+    if (buffer_size == subwarpsize) // register buffer full?
+    {
+      output_array[id_in_warp] = index_buffer;
+      output_array += subwarpsize;
+      buffer_size = 0;
+    }
+
+    ++num_nnz;
+  }
+
+  // write remaining entries from register buffer:
+  if (id_in_warp < buffer_size)
+    output_array[id_in_warp] = index_buffer;
+
+  return num_nnz;
+}
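+// The register buffer above stages one pending output index per lane: lane k keeps the k-th column
+// produced since the last flush, and once buffer_size reaches subwarpsize all lanes store their
+// index in a single coalesced write. With a hypothetical subwarp of size 4 producing the columns
+// 3, 8, 11, 20, lanes 0..3 hold exactly those values and write output_array[0..3] at once.
+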
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_2(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          const IndexT * B_col_indices,
+          IndexT B_size2,
+          IndexT * C_row_indices,
+          unsigned int *subwarpsize_array,
+          unsigned int *max_row_size_A,
+          unsigned int *max_row_size_B,
+          unsigned int *scratchpad_offsets,
+          unsigned int *scratchpad_indices)
+{
+  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
+
+  unsigned int num_warps  =  blockDim.x / subwarpsize;
+  unsigned int warp_id    = threadIdx.x / subwarpsize;
+  unsigned int id_in_warp = threadIdx.x % subwarpsize;
+
+  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
+  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
+  unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+  {
+    unsigned int row_A_start = A_row_indices[row];
+    unsigned int row_A_end   = A_row_indices[row+1];
+
+    if (row_A_end - row_A_start > subwarpsize)
+    {
+      unsigned int final_merge_start = 0;
+      unsigned int final_merge_end   = 0;
+
+      // merge to temporary scratchpad memory:
+      unsigned int *subwarp_scratchpad = subwarp_scratchpad_start;
+      unsigned int iter = 0;
+      while (row_A_end > row_A_start)
+      {
+        unsigned int my_row_B = row_A_start + id_in_warp;
+        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+        unsigned int nnz_in_merge = merge_subwarp_symbolic_double(row_B_start, row_B_end, B_col_indices, B_size2,
+                                                                  subwarp_scratchpad, id_in_warp, subwarpsize);
+
+        final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start;
+        final_merge_end   = (iter == id_in_warp) ? final_merge_start + nnz_in_merge        : final_merge_end;
+        ++iter;
+
+        row_A_start += subwarpsize;
+        subwarp_scratchpad += scratchpad_rowlength; // write to next row in scratchpad
+      }
+
+      // final merge:
+      unsigned int num_nnz = merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize);
+
+      if (id_in_warp == 0)
+        C_row_indices[row] = num_nnz;
+    }
+    else
+    {
+      // single merge
+      unsigned int my_row_B = row_A_start + id_in_warp;
+      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+      unsigned int num_nnz = merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize);
+
+      if (id_in_warp == 0)
+        C_row_indices[row] = num_nnz;
+    }
+  }
+
+}
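+// Rows of A with more than subwarpsize entries are handled in two passes above: each iteration of
+// the while-loop merges up to subwarpsize rows of B into one scratchpad row, and in pass number
+// 'iter' the lane with id_in_warp == iter records where that scratchpad row starts and ends; the
+// final merge then combines those per-lane scratchpad rows like an ordinary subwarp merge to obtain
+// the number of nonzeros of the corresponding row of C.
+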
+
+
+//
+// Stage 3: Fill C with values
+//
+template<typename NumericT>
+__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor,
+                                              unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token,
+                                              unsigned int *output_indices, NumericT *output_values,
+                                              unsigned int id_in_warp, unsigned int subwarpsize)
+{
+  unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
+  NumericT     current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;
+
+  unsigned int index_buffer = 0;
+  NumericT     value_buffer = 0;
+  unsigned int buffer_size = 0;
+  unsigned int nnz_written = 0;
+  while (1)
+  {
+    // determine current minimum:
+    unsigned int min_index = current_front_index;
+    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+    if (min_index == invalid_token) // done
+      break;
+
+    // compute entry in C:
+    NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0;
+    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+      output_value += __shfl_xor((int)output_value, (int)i);
+
+    // update front:
+    if (current_front_index == min_index)
+    {
+      ++input_start;
+      current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
+      current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;
+    }
+
+    // write current front to register buffer:
+    index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
+    value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
+    ++buffer_size;
+
+    // flush register buffer via a coalesced write once full:
+    if (buffer_size == subwarpsize)
+    {
+      output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize;
+      output_values[id_in_warp]  = value_buffer; output_values  += subwarpsize;
+      buffer_size = 0;
+    }
+
+    ++nnz_written;
+  }
+
+  // write remaining entries in register buffer to C:
+  if (id_in_warp < buffer_size)
+  {
+    output_indices[id_in_warp] = index_buffer;
+    output_values[id_in_warp]  = value_buffer;
+  }
+
+  return nnz_written;
+}
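+// What the numeric merge computes, written as a scalar sketch (hypothetical helper using a
+// std::map accumulator instead of the sorted-stream merge above):
+//
+//   template<typename NumericT>
+//   void row_times_matrix(unsigned int row_A_start, unsigned int row_A_end,
+//                         unsigned int const *A_cols, NumericT const *A_vals,
+//                         unsigned int const *B_rows, unsigned int const *B_cols, NumericT const *B_vals,
+//                         std::map<unsigned int, NumericT> & row_C)   // requires <map>
+//   {
+//     for (unsigned int i = row_A_start; i < row_A_end; ++i)          // C(r, :) = sum_k A(r, k) * B(k, :)
+//       for (unsigned int j = B_rows[A_cols[i]]; j < B_rows[A_cols[i] + 1]; ++j)
+//         row_C[B_cols[j]] += A_vals[i] * B_vals[j];
+//   }
+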
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_stage_3(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          const NumericT * A_elements,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          const IndexT * B_col_indices,
+          const NumericT * B_elements,
+          IndexT B_size2,
+          IndexT const * C_row_indices,
+          IndexT * C_col_indices,
+          NumericT * C_elements,
+          unsigned int *subwarpsize_array,
+          unsigned int *max_row_size_A,
+          unsigned int *max_row_size_B,
+          unsigned int *scratchpad_offsets,
+          unsigned int *scratchpad_indices,
+          NumericT *scratchpad_values)
+{
+  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
+
+  unsigned int num_warps  =  blockDim.x / subwarpsize;
+  unsigned int warp_id    = threadIdx.x / subwarpsize;
+  unsigned int id_in_warp = threadIdx.x % subwarpsize;
+
+  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
+  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
+  unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+  {
+    unsigned int row_A_start = A_row_indices[row];
+    unsigned int row_A_end   = A_row_indices[row+1];
+
+    if (row_A_end - row_A_start > subwarpsize)
+    {
+      // first merge stage:
+      unsigned int final_merge_start = 0;
+      unsigned int final_merge_end = 0;
+      unsigned int iter = 0;
+      unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift;
+      NumericT     *scratchpad_values_ptr  = scratchpad_values  + subwarp_scratchpad_shift;
+
+      while (row_A_start < row_A_end)
+      {
+        unsigned int my_row_B = row_A_start + id_in_warp;
+        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
+        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+        NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+        unsigned int nnz_written = merge_subwarp_numeric(val_A,
+                                                         row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
+                                                         scratchpad_indices_ptr, scratchpad_values_ptr,
+                                                         id_in_warp, subwarpsize);
+
+        if (iter == id_in_warp)
+        {
+          final_merge_start = scratchpad_indices_ptr - scratchpad_indices;
+          final_merge_end   = final_merge_start + nnz_written;
+        }
+        ++iter;
+
+        row_A_start += subwarpsize;
+        scratchpad_indices_ptr += scratchpad_rowlength;
+        scratchpad_values_ptr  += scratchpad_rowlength;
+      }
+
+      // second merge stage:
+      unsigned int index_in_C = C_row_indices[row];
+      merge_subwarp_numeric(NumericT(1),
+                            final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2,
+                            C_col_indices + index_in_C, C_elements + index_in_C,
+                            id_in_warp, subwarpsize);
+    }
+    else
+    {
+      unsigned int my_row_B = row_A_start + id_in_warp;
+      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
+      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+      NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+      unsigned int index_in_C = C_row_indices[row];
+
+      merge_subwarp_numeric(val_A,
+                            row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
+                            C_col_indices + index_in_C, C_elements + index_in_C,
+                            id_in_warp, subwarpsize);
+    }
+  }
+
+}
+
+
+
+
+//
+// Decomposition kernels:
+//
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_decompose_1(
+          const IndexT * A_row_indices,
+          IndexT A_size1,
+          IndexT max_per_row,
+          IndexT *chunks_per_row)
+{
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+  {
+    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
+    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1);
+  }
+}
+
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_A2(
+          IndexT * A2_row_indices,
+          IndexT * A2_col_indices,
+          NumericT * A2_elements,
+          IndexT A2_size1,
+          IndexT *new_row_buffer)
+{
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
+  {
+    unsigned int index_start = new_row_buffer[i];
+    unsigned int index_stop  = new_row_buffer[i+1];
+
+    A2_row_indices[i] = index_start;
+
+    for (IndexT j = index_start; j < index_stop; ++j)
+    {
+      A2_col_indices[j] = j;
+      A2_elements[j] = NumericT(1);
+    }
+  }
+
+  // write last entry in row_buffer with global thread 0:
+  if (threadIdx.x == 0 && blockIdx.x == 0)
+    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
+}
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_G1(
+          IndexT * G1_row_indices,
+          IndexT * G1_col_indices,
+          NumericT * G1_elements,
+          IndexT G1_size1,
+          IndexT const *A_row_indices,
+          IndexT const *A_col_indices,
+          NumericT const *A_elements,
+          IndexT A_size1,
+          IndexT A_nnz,
+          IndexT max_per_row,
+          IndexT *new_row_buffer)
+{
+  // Part 1: Copy column indices and entries:
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
+  {
+    G1_col_indices[i] = A_col_indices[i];
+    G1_elements[i]    = A_elements[i];
+  }
+
+  // Part 2: Derive new row indices:
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+  {
+    unsigned int old_start = A_row_indices[i];
+    unsigned int new_start = new_row_buffer[i];
+    unsigned int row_chunks = new_row_buffer[i+1] - new_start;
+
+    for (IndexT j=0; j<row_chunks; ++j)
+      G1_row_indices[new_start + j] = old_start + j * max_per_row;
+  }
+
+  // write last entry in row_buffer with global thread 0:
+  if (threadIdx.x == 0 && blockIdx.x == 0)
+    G1_row_indices[G1_size1] = A_row_indices[A_size1];
+}
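+// Worked example for the decomposition A = A2 * G1 performed by the two kernels above (illustrative
+// numbers): a single row of A with 5 nonzeros and max_per_row = 2 yields chunks_per_row = {3}, and
+// the exclusive scan gives new_row_buffer = {0, 3}. G1 keeps A's column indices and values but
+// splits the row into three rows of lengths 2, 2 and 1 (G1_row_indices = {0, 2, 4, 5}), while A2 is
+// a 1 x 3 matrix of ones (A2_row_indices = {0, 3}) that sums the partial rows back up, so that
+// A2 * G1 reproduces the original row of A.
+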
+
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A     Left factor
+* @param B     Right factor
+* @param C     Result matrix
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+  C.resize(A.size1(), B.size2(), false);
+
+  unsigned int blocknum = 256;
+  unsigned int threadnum = 128;
+
+  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size required by each work group (derived from the row lengths of A)
+  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  viennacl::tools::timer timer;
+#endif
+
+  //
+  // Stage 1: Determine upper bound for number of nonzeros
+  //
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  cudaDeviceSynchronize();
+  timer.start();
+#endif
+
+  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                          static_cast<unsigned int>(A.size1()),
+                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                          viennacl::cuda_arg(subwarp_sizes),
+                                                          viennacl::cuda_arg(max_nnz_row_A),
+                                                          viennacl::cuda_arg(max_nnz_row_B)
+                                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  cudaDeviceSynchronize();
+  std::cout << "Stage 1 device: " << timer.get() << std::endl;
+  timer.start();
+#endif
+
+  subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());
+
+  max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());
+
+  max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());
+
+  //std::cout << "Subwarp sizes: " << subwarp_sizes << std::endl;
+
+  viennacl::vector<unsigned int> scratchpad_offsets(blocknum, viennacl::context(MAIN_MEMORY)); // start offset of each work group's region within the scratchpad
+  unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle());
+
+  unsigned int max_subwarp_size = 0;
+  unsigned int A_max_nnz_per_row = 0;
+  unsigned int scratchpad_offset = 0;
+  //std::cout << "Scratchpad offsets: " << std::endl;
+  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
+  {
+    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
+    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
+
+    scratchpad_offsets_ptr[i] = scratchpad_offset;
+    //std::cout << scratchpad_offset << " (with " << (max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1) << " warp reloads per group at " << max_nnz_row_A_ptr[i] << " max rows, "
+    //                                            << upper_bound_nonzeros_per_row_C_ptr[i] << " row length, "
+    //                                            << (256 / subwarp_sizes_ptr[i]) << " warps per group " << std::endl;
+    unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1;
+    unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i];
+    unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i];
+    scratchpad_offset +=  max_warp_reloads
+                        * max_row_length_after_warp_merge
+                        * warps_in_group;
+  }
+  //std::cout << "Scratchpad memory for indices: " << scratchpad_offset << " entries (" << scratchpad_offset * sizeof(unsigned int) * 1e-6 << " MB)" << std::endl;
+
+  if (max_subwarp_size > 32)
+  {
+    // determine augmented size:
+    unsigned int max_entries_in_G = 1024;
+    if (A_max_nnz_per_row <= 512*512)
+      max_entries_in_G = 512;
+    if (A_max_nnz_per_row <= 256*256)
+      max_entries_in_G = 256;
+    if (A_max_nnz_per_row <= 128*128)
+      max_entries_in_G = 128;
+    if (A_max_nnz_per_row <= 64*64)
+      max_entries_in_G = 64;
+
+    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                static_cast<unsigned int>(A.size1()),
+                                                                static_cast<unsigned int>(max_entries_in_G),
+                                                                viennacl::cuda_arg(exclusive_scan_helper)
+                                                               );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");
+
+    thrust::exclusive_scan(thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)),
+                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()),
+                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)));
+
+    unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+    // split A = A2 * G1
+    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(),        A.nnz(), viennacl::traits::context(A));
+
+    // fill A2:
+    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
+                                                       static_cast<unsigned int>(A2.size1()),
+                                                       viennacl::cuda_arg(exclusive_scan_helper)
+                                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");
+
+    // fill G1:
+    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
+                                                       static_cast<unsigned int>(G1.size1()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(A.handle()),
+                                                       static_cast<unsigned int>(A.size1()),
+                                                       static_cast<unsigned int>(A.nnz()),
+                                                       static_cast<unsigned int>(max_entries_in_G),
+                                                       viennacl::cuda_arg(exclusive_scan_helper)
+                                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");
+
+    // compute tmp = G1 * B;
+    // C = A2 * tmp;
+    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+    prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+    prod_impl(A2, tmp, C); // this may split A2 again
+    return;
+  }
+
+  subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
+  max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
+  max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
+  scratchpad_offsets.switch_memory_context(viennacl::traits::context(A));
+
+  viennacl::vector<unsigned int> scratchpad_indices(scratchpad_offset, viennacl::traits::context(A)); // scratchpad for the column indices produced by intermediate merges
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  std::cout << "Intermediate host stage: " << timer.get() << std::endl;
+  timer.start();
+#endif
+
+  //
+  // Stage 2: Determine pattern of C
+  //
+
+  compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                         viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                         static_cast<unsigned int>(A.size1()),
+                                                         viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                         viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                         static_cast<unsigned int>(B.size2()),
+                                                         viennacl::cuda_arg<unsigned int>(C.handle1()),
+                                                         viennacl::cuda_arg(subwarp_sizes),
+                                                         viennacl::cuda_arg(max_nnz_row_A),
+                                                         viennacl::cuda_arg(max_nnz_row_B),
+                                                         viennacl::cuda_arg(scratchpad_offsets),
+                                                         viennacl::cuda_arg(scratchpad_indices)
+                                                        );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  cudaDeviceSynchronize();
+  std::cout << "Stage 2: " << timer.get() << std::endl;
+  timer.start();
+#endif
+
+
+  // exclusive scan on C.handle1(), ultimately allowing to allocate remaining memory for C
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+  unsigned int current_offset = 0;
+  for (std::size_t i=0; i<C.size1(); ++i)
+  {
+    unsigned int tmp = row_buffer[i];
+    row_buffer.set(i, current_offset);
+    current_offset += tmp;
+  }
+  row_buffer.set(C.size1(), current_offset);
+  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
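+  // e.g. the per-row counts {2, 1, 3} written by stage 2 become the offsets {0, 2, 3, 6};
+  // the final value (current_offset == 6) is the number of nonzeros that C has to reserve below.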
+
+
+  //
+  // Stage 3: Compute entries in C
+  //
+  C.reserve(current_offset, false);
+
+  viennacl::vector<NumericT> scratchpad_values(scratchpad_offset, viennacl::traits::context(A)); // scratchpad for the values produced by intermediate merges
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  std::cout << "Intermediate stage 2->3: " << timer.get() << std::endl;
+  timer.start();
+#endif
+
+  compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                          viennacl::cuda_arg<NumericT>(A.handle()),
+                                                          static_cast<unsigned int>(A.size1()),
+                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                          viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                          viennacl::cuda_arg<NumericT>(B.handle()),
+                                                          static_cast<unsigned int>(B.size2()),
+                                                          viennacl::cuda_arg<unsigned int>(C.handle1()),
+                                                          viennacl::cuda_arg<unsigned int>(C.handle2()),
+                                                          viennacl::cuda_arg<NumericT>(C.handle()),
+                                                          viennacl::cuda_arg(subwarp_sizes),
+                                                          viennacl::cuda_arg(max_nnz_row_A),
+                                                          viennacl::cuda_arg(max_nnz_row_B),
+                                                          viennacl::cuda_arg(scratchpad_offsets),
+                                                          viennacl::cuda_arg(scratchpad_indices),
+                                                          viennacl::cuda_arg(scratchpad_values)
+                                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+  cudaDeviceSynchronize();
+  std::cout << "Stage 3: " << timer.get() << std::endl;
+  std::cout << "----------" << std::endl;
+#endif
+
+}
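+// Sketch of the usual call path (illustrative): user code does not call prod_impl() directly but
+// goes through the convenience expression mentioned in the documentation above, e.g.
+//
+//   viennacl::compressed_matrix<double> A, B, C;   // assume A and B are already populated
+//   C = viennacl::linalg::prod(A, B);              // sparse * sparse product via the RMerge scheme above
+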
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


[17/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
new file mode 100644
index 0000000..56e3c14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
@@ -0,0 +1,705 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/spgemm_vector.hpp
+    @brief Row-merge kernels used by the sparse matrix-sparse matrix product on the CPU, single-threaded or with OpenMP (optionally vectorized via AVX2).
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/linalg/host_based/common.hpp"
+
+
+#ifdef VIENNACL_WITH_AVX2
+#include "immintrin.h"
+#endif
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
+                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
+                                             int *row_C_vector_output)
+{
+  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+  __m256i avx_load_mask2 = avx_load_mask;
+
+  __m256i avx_row_indices = _mm256_set1_epi32(0);
+          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+  __m256i avx_index_front = avx_all_bsize2;
+  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+  int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    __m256i avx_index_min1 = avx_index_front;
+    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+    int min_index_in_front = ((int*)&avx_index_min1)[0];
+    // check for end of merge operation:
+    if (min_index_in_front == B_size2)
+      break;
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+
+    // advance index front where equal to minimum index:
+    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+    // first part: set index to B_size2 if equal to minimum index:
+    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+    // second part: increment row_start registers where minimum found:
+    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
+    // third part part: load new data where more entries available:
+    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
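+// The three min/permute steps above form a horizontal minimum over the eight 32-bit lanes: e.g. a
+// front of {7, 3, 9, 1, 6, 2, 8, 4} becomes {6, 2, 8, 1, 6, 2, 8, 1} after the cross-half step,
+// {6, 1, 6, 1, ...} after the two-apart step, and finally 1 in every lane, so element 0 can be read
+// back as the minimum column index of the whole front.
+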
+#endif
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum>
+unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
+                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
+                                          unsigned int *row_C_vector_output)
+{
+  unsigned int index_front[IndexNum+1];
+  unsigned int const *index_front_start[IndexNum+1];
+  unsigned int const *index_front_end[IndexNum+1];
+
+  // Set up pointers for loading the indices:
+  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+  {
+    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
+    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
+  }
+  index_front_start[IndexNum] = row_C_vector_input;
+  index_front_end[IndexNum]   = row_C_vector_input_end;
+
+  // load indices:
+  for (unsigned int i=0; i<=IndexNum; ++i)
+    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+  unsigned int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    unsigned int min_index_in_front = B_size2;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+      min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+    if (min_index_in_front == B_size2) // we're done
+      break;
+
+    // advance index front where equal to minimum index:
+    for (unsigned int i=0; i<=IndexNum; ++i)
+    {
+      if (index_front[i] == min_index_in_front)
+      {
+        index_front_start[i] += 1;
+        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+      }
+    }
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
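+
+// Illustrative example (not part of the library): merging two sorted B-rows with column indices {0, 2, 5}
+// and {1, 2, 7} together with the input buffer {3} via row_C_scan_symbolic_vector_N<2> produces the
+// deduplicated index list {0, 1, 2, 3, 5, 7} in row_C_vector_output and returns 6.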
+
+struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
+struct spgemm_output_write_disabled { static void apply(unsigned int *   , unsigned int      ) {               } };
+
+template<typename OutputWriterT>
+unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
+                                          unsigned int const *input2_begin, unsigned int const *input2_end,
+                                          unsigned int termination_index,
+                                          unsigned int *output_begin)
+{
+  unsigned int *output_ptr = output_begin;
+
+  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+  while (1)
+  {
+    unsigned int min_index = std::min(val_1, val_2);
+
+    if (min_index == termination_index)
+      break;
+
+    if (min_index == val_1)
+    {
+      ++input1_begin;
+      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+    }
+
+    if (min_index == val_2)
+    {
+      ++input2_begin;
+      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+    }
+
+    // write current entry:
+    OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index;    if necessary
+    ++output_ptr;
+  }
+
+  return static_cast<unsigned int>(output_ptr - output_begin);
+}
+
+inline
+unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
+                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
+{
+  // Trivial case: row length 0:
+  if (row_start_A == row_end_A)
+    return 0;
+
+  // Trivial case: row length 1:
+  if (row_end_A - row_start_A == 1)
+  {
+    unsigned int A_col = A_col_buffer[row_start_A];
+    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
+  }
+
+  // Optimizations for row length 2:
+  unsigned int row_C_len = 0;
+  if (row_end_A - row_start_A == 2)
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                      B_size2,
+                                                                      row_C_vector_1);
+  }
+  else // for more than two rows we can safely merge the first two:
+  {
+#ifdef VIENNACL_WITH_AVX2
+    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+                                                (int*)row_C_vector_1);
+    row_start_A += 8;
+#else
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+    row_C_len =  row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                           B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                           B_size2,
+                                                                           row_C_vector_1);
+    row_start_A += 2;
+#endif
+  }
+
+  // all other row lengths:
+  while (row_end_A > row_start_A)
+  {
+#ifdef VIENNACL_WITH_AVX2
+    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
+    {
+      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+                                                                (int*)row_C_vector_3);
+      if (row_start_A + 8 >= row_end_A)
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                              B_size2,
+                                                                              row_C_vector_2);
+      else
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                               B_size2,
+                                                                               row_C_vector_2);
+      row_start_A += 8;
+    }
+    else
+#endif
+    if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
+    {
+      // process last row
+      unsigned int row_index_B = A_col_buffer[row_start_A];
+      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                        B_size2,
+                                                                        row_C_vector_2);
+    }
+    else if (row_start_A + 1 < row_end_A) // at least two more rows left, so merge two of them
+    {
+      // merge the next two rows of B:
+      unsigned int A_col_1 = A_col_buffer[row_start_A];
+      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+      unsigned int merged_len =  row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                                           B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                                           B_size2,
+                                                                                           row_C_vector_3);
+      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
+        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                          B_size2,
+                                                                          row_C_vector_2);
+      else
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                              B_size2,
+                                                                              row_C_vector_2);
+      row_start_A += 2;
+    }
+    else // fallback: merge a single remaining row
+    {
+      // process single row:
+      unsigned int row_index_B = A_col_buffer[row_start_A];
+      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                            B_size2,
+                                                                            row_C_vector_2);
+      ++row_start_A;
+    }
+
+    std::swap(row_C_vector_1, row_C_vector_2);
+  }
+
+  return row_C_len;
+}
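+
+// Note: this symbolic scan only determines the number of nonzeros in one row of C = A * B; the numeric
+// pass further down (row_C_scan_numeric_vector) writes the actual column indices and values once the row
+// of C has been allocated. The three work buffers are used in a ping-pong fashion: the current partial
+// merge lives in row_C_vector_1, fresh merges go to row_C_vector_3, their combination is written to
+// row_C_vector_2, and buffers 1 and 2 are swapped at the end of each iteration.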
+
+//////////////////////////////
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum, typename NumericT>
+unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
+                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
+                                          unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
+{
+  unsigned int index_front[IndexNum+1];
+  unsigned int const *index_front_start[IndexNum+1];
+  unsigned int const *index_front_end[IndexNum+1];
+  NumericT const * value_front_start[IndexNum+1];
+  NumericT values_A[IndexNum+1];
+
+  // Set up pointers for loading the indices:
+  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+  {
+    unsigned int row_B = *row_indices_B;
+
+    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
+    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
+    value_front_start[i] = B_elements   + B_row_buffer[row_B];
+    values_A[i]          = val_A[i];
+  }
+  index_front_start[IndexNum] = row_C_vector_input;
+  index_front_end[IndexNum]   = row_C_vector_input_end;
+  value_front_start[IndexNum] = row_C_vector_input_values;
+  values_A[IndexNum]          = NumericT(1);
+
+  // load indices:
+  for (unsigned int i=0; i<=IndexNum; ++i)
+    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+  unsigned int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    unsigned int min_index_in_front = B_size2;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+      min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+    if (min_index_in_front == B_size2) // we're done
+      break;
+
+    // advance index front where equal to minimum index:
+    NumericT row_C_value = 0;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+    {
+      if (index_front[i] == min_index_in_front)
+      {
+        index_front_start[i] += 1;
+        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+        row_C_value += values_A[i] * *value_front_start[i];
+        value_front_start[i] += 1;
+      }
+    }
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+    *row_C_vector_output_values = row_C_value;
+    ++row_C_vector_output_values;
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
+                                             int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
+                                             int B_size2,
+                                             int *row_C_vector_output, double *row_C_vector_output_values)
+{
+  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
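+  // (entries of avx_row_indices_offsets smaller than the number of remaining rows yield a negative
+  //  difference, i.e. the sign bit is set; the masked gathers below load exactly those lanes, so at most
+  //  eight rows of B are merged per call)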
+  __m256i avx_load_mask2 = avx_load_mask;
+
+  __m256i avx_row_indices = _mm256_set1_epi32(0);
+          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+
+  // load values from A:
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                      values_A,                  //base ptr
+                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0),                           //indices
+                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_A_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                       values_A,                  //base ptr
+                                                       _mm256_extractf128_si256(avx_row_indices_offsets, 1),                           //indices
+                                                       _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+          avx_load_mask2  = avx_load_mask;
+  __m256i avx_index_front = avx_all_bsize2;
+  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+  // load front values from B:
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                          B_elements,                  //base ptr
+                                                          _mm256_extractf128_si256(avx_row_start, 0),                           //indices
+                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_front_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                           B_elements,                  //base ptr
+                                                           _mm256_extractf128_si256(avx_row_start, 1),                           //indices
+                                                           _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+  int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    __m256i avx_index_min1 = avx_index_front;
+    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+    int min_index_in_front = ((int*)&avx_index_min1)[0];
+    // check for end of merge operation:
+    if (min_index_in_front == B_size2)
+      break;
+
+    // accumulate value (can certainly be done more elegantly...)
+    double value = 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0] * ((double*)&avx_value_A_low)[0] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1] * ((double*)&avx_value_A_low)[1] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2] * ((double*)&avx_value_A_low)[2] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3] * ((double*)&avx_value_A_low)[3] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
+    *row_C_vector_output_values = value;
+    ++row_C_vector_output_values;
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+
+    // advance index front where equal to minimum index:
+    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+    // first part: set index to B_size2 if equal to minimum index:
+    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+    // second part: increment row_start registers where minimum found:
+    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
+    // third part: load new data where more entries are available:
+    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+    avx_load_mask2  = avx_load_mask;
+    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+    // load new values where necessary:
+    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+    avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
+                                            B_elements,                  //base ptr
+                                            _mm256_extractf128_si256(avx_row_start, 0),                           //indices
+                                            _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+
+    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
+                                    B_elements,                  //base ptr
+                                    _mm256_extractf128_si256(avx_row_start, 1),                           //indices
+                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+    // (the freshly gathered B values are multiplied with the corresponding A values in the accumulation
+    //  step of the next iteration)
+
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+
+template<typename NumericT>
+unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
+                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
+                                         unsigned int termination_index,
+                                         unsigned int *output_index_begin, NumericT *output_values_begin)
+{
+  unsigned int *output_ptr = output_index_begin;
+
+  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+  while (1)
+  {
+    unsigned int min_index = std::min(index1, index2);
+    NumericT value = 0;
+
+    if (min_index == termination_index)
+      break;
+
+    if (min_index == index1)
+    {
+      ++input1_index_begin;
+      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+
+      value += factor1 * *input1_values_begin;
+      ++input1_values_begin;
+    }
+
+    if (min_index == index2)
+    {
+      ++input2_index_begin;
+      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+      value += factor2 * *input2_values_begin;
+      ++input2_values_begin;
+    }
+
+    // write current entry:
+    *output_ptr = min_index;
+    ++output_ptr;
+    *output_values_begin = value;
+    ++output_values_begin;
+  }
+
+  return static_cast<unsigned int>(output_ptr - output_index_begin);
+}
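+
+// Illustrative example (not part of the library): for input1 = {0, 2} with values {1.0, 2.0} and factor1 = 3.0,
+// and input2 = {2, 4} with values {5.0, 7.0} and factor2 = 1.0, the merged row has indices {0, 2, 4} with
+// values {3.0, 3.0*2.0 + 5.0, 7.0} = {3.0, 11.0, 7.0}, and the function returns 3.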
+
+template<typename NumericT>
+void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
+                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
+                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
+                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
+                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
+{
+  (void)row_end_C;
+
+  // Trivial case: row length 0:
+  if (row_start_A == row_end_A)
+    return;
+
+  // Trivial case: row length 1:
+  if (row_end_A - row_start_A == 1)
+  {
+    unsigned int A_col = A_col_buffer[row_start_A];
+    unsigned int B_end = B_row_buffer[A_col + 1];
+    NumericT A_value   = A_elements[row_start_A];
+    C_col_buffer += row_start_C;
+    C_elements += row_start_C;
+    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
+    {
+      *C_col_buffer = B_col_buffer[j];
+      *C_elements = A_value * B_elements[j];
+    }
+    return;
+  }
+
+  unsigned int row_C_len = 0;
+  if (row_end_A - row_start_A == 2) // directly merge to C:
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+    unsigned int B_offset_1 = B_row_buffer[A_col_1];
+    unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                B_size2,
+                                C_col_buffer + row_start_C, C_elements + row_start_C);
+    return;
+  }
+#ifdef VIENNACL_WITH_AVX2
+  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
+  {
+    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+                                               (int*)row_C_vector_1, row_C_vector_1_values);
+    row_start_A += 8;
+  }
+#endif
+  else // safely merge two rows into temporary buffer:
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+    unsigned int B_offset_1 = B_row_buffer[A_col_1];
+    unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                            B_size2,
+                                            row_C_vector_1, row_C_vector_1_values);
+    row_start_A += 2;
+  }
+
+  // process remaining rows:
+  while (row_end_A > row_start_A)
+  {
+#ifdef VIENNACL_WITH_AVX2
+    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
+    {
+      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+                                                               (int*)row_C_vector_3, row_C_vector_3_values);
+      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      row_start_A += 8;
+    }
+    else
+#endif
+    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
+    {
+      unsigned int A_col    = A_col_buffer[row_start_A];
+      unsigned int B_offset = B_row_buffer[A_col];
+
+      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              C_col_buffer + row_start_C, C_elements + row_start_C);
+      return;
+    }
+    else if (row_start_A + 2 < row_end_A) // at least three more rows left, so merge two of them
+    {
+      // merge the next two rows of B:
+      unsigned int A_col_1 = A_col_buffer[row_start_A];
+      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+      unsigned int B_offset_1 = B_row_buffer[A_col_1];
+      unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                                            B_size2,
+                                                            row_C_vector_3, row_C_vector_3_values);
+      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+                                              row_C_vector_1, row_C_vector_1 + row_C_len,  row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      row_start_A += 2;
+    }
+    else
+    {
+      unsigned int A_col    = A_col_buffer[row_start_A];
+      unsigned int B_offset = B_row_buffer[A_col];
+
+      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      ++row_start_A;
+    }
+
+    std::swap(row_C_vector_1,        row_C_vector_2);
+    std::swap(row_C_vector_1_values, row_C_vector_2_values);
+  }
+}
+
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
new file mode 100644
index 0000000..b4944a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
@@ -0,0 +1,1188 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/vector_operations.hpp
+    @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
+#endif
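+// The threshold can be overridden by defining VIENNACL_OPENMP_VECTOR_MIN_SIZE before this header is included,
+// e.g. via an illustrative compiler invocation such as (assuming GCC/Clang with OpenMP):
+//   g++ -fopenmp -DVIENNACL_WITH_OPENMP -DVIENNACL_OPENMP_VECTOR_MIN_SIZE=10000 my_program.cpp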
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
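+  // flip_sign(val) returns -val for signed integer and floating-point types; the unsigned overloads below
+  // are deliberate no-ops, presumably to keep generic code of the form x = -alpha * y well-defined and
+  // warning-free for unsigned integer vectors.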
+  template<typename NumericT>
+  NumericT flip_sign(NumericT val) { return -val; }
+  inline unsigned long  flip_sign(unsigned long  val) { return val; }
+  inline unsigned int   flip_sign(unsigned int   val) { return val; }
+  inline unsigned short flip_sign(unsigned short val) { return val; }
+  inline unsigned char  flip_sign(unsigned char  val) { return val; }
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  DestNumericT      * data_dest = detail::extract_raw_pointer<DestNumericT>(dest);
+  SrcNumericT const * data_src  = detail::extract_raw_pointer<SrcNumericT>(src);
+
+  vcl_size_t start_dest = viennacl::traits::start(dest);
+  vcl_size_t inc_dest   = viennacl::traits::stride(dest);
+  vcl_size_t size_dest  = viennacl::traits::size(dest);
+
+  vcl_size_t start_src = viennacl::traits::start(src);
+  vcl_size_t inc_src   = viennacl::traits::stride(src);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size_dest); ++i)
+    data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]);
+}
+
+template<typename NumericT, typename ScalarT1>
+void av(vector_base<NumericT> & vec1,
+        vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef NumericT        value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  if (reciprocal_alpha)
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+    for (long i = 0; i < static_cast<long>(size1); ++i)
+      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha;
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+    for (long i = 0; i < static_cast<long>(size1); ++i)
+      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha;
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /* len_beta */,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT      value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = detail::flip_sign(data_beta);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  vcl_size_t start3 = viennacl::traits::start(vec3);
+  vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+  if (reciprocal_alpha)
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+  else
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = detail::flip_sign(data_beta);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  vcl_size_t start3 = viennacl::traits::start(vec3);
+  vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+  if (reciprocal_alpha)
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+  else
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+}
+
+
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1   The vector to which the value should be assigned
+* @param alpha  The value to be assigned
+* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT>
+void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false)
+{
+  typedef NumericT       value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+  vcl_size_t loop_bound  = up_to_internal_size ? vec1.internal_size() : size1;  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+  value_type data_alpha = static_cast<value_type>(alpha);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(loop_bound); ++i)
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha;
+}
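+
+// Illustrative usage sketch (assumes the host backend is active; library users would normally go through
+// the high-level viennacl::vector interface rather than calling this routine directly):
+//   viennacl::vector<double> v(1000);
+//   viennacl::linalg::host_based::vector_assign(v, 0.0);   // sets all 1000 entries to zero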
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1   The first vector (or -range, or -slice)
+* @param vec2   The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+  typedef NumericT      value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+  {
+    value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp;
+  }
+}
+
+
+///////////////////////// Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  typedef NumericT                                           value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> >    OpFunctor;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+  vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
+  vcl_size_t inc3   = viennacl::traits::stride(proxy.rhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]);
+}
+
+/** @brief Implementation of unary element-wise operations v1 = OP(v2), e.g. v1 = exp(v2) applied element-wise
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2 and the unary operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+  typedef NumericT      value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> >    OpFunctor;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]);
+}
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+//implementation of inner product:
+
+namespace detail
+{
+
+// the following circumvents problems when trying to use a variable of template parameter type for a reduction.
+// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem.
+// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion.
+
+#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \
+                                       RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) \
+      temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \
+    return static_cast<RESULTSCALART>(temp); \
+  }
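+
+// For reference, the two macros above expand (e.g. for float) to an ordinary overload of the form
+//   inline float inner_prod_impl(float const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1,
+//                                float const * data_vec2, vcl_size_t start2, vcl_size_t inc2)
+//   { float temp = 0; /* optional OpenMP reduction pragma */ for (...) temp += ...; return static_cast<float>(temp); }
+// so the reduction pragma placed between the two invocations refers to a 'temp' of concrete (non-template) type.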
+
+// char
+VIENNACL_INNER_PROD_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(char)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned char)
+
+
+// short
+VIENNACL_INNER_PROD_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(short)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned short)
+
+
+// int
+VIENNACL_INNER_PROD_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(int)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned int)
+
+
+// long
+VIENNACL_INNER_PROD_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(long)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned long)
+
+
+// float
+VIENNACL_INNER_PROD_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(float)
+
+// double
+VIENNACL_INNER_PROD_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(double)
+
+#undef VIENNACL_INNER_PROD_IMPL_1
+#undef VIENNACL_INNER_PROD_IMPL_2
+}
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+                     vector_base<NumericT> const & vec2,
+                     ScalarT & result)
+{
+  typedef NumericT      value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  result = detail::inner_prod_impl(data_vec1, start1, inc1, size1,
+                                   data_vec2, start2, inc2);  //Note: Assignment to result might be expensive, thus a temporary is introduced here
+}
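+
+// Illustrative usage sketch: library users call the dispatching frontend rather than this backend directly,
+// e.g. (assuming two viennacl::vector<double> objects v1 and v2 of equal size):
+//   double s = viennacl::linalg::inner_prod(v1, v2);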
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+                     vector_tuple<NumericT> const & vec_tuple,
+                     vector_base<NumericT> & result)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_x = detail::extract_raw_pointer<value_type>(x);
+
+  vcl_size_t start_x = viennacl::traits::start(x);
+  vcl_size_t inc_x   = viennacl::traits::stride(x);
+  vcl_size_t size_x  = viennacl::traits::size(x);
+
+  std::vector<value_type> temp(vec_tuple.const_size());
+  std::vector<value_type const *> data_y(vec_tuple.const_size());
+  std::vector<vcl_size_t> start_y(vec_tuple.const_size());
+  std::vector<vcl_size_t> stride_y(vec_tuple.const_size());
+
+  for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
+  {
+    data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
+    start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
+    stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
+  }
+
+  // Note: No OpenMP here, because OpenMP cannot perform a reduction over the temp array. The memory bandwidth
+  // savings of reading x only once are still expected to justify this single-threaded approach.
+  for (vcl_size_t i = 0; i < size_x; ++i)
+  {
+    value_type entry_x = data_x[i*inc_x+start_x];
+    for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+      temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
+  }
+
+  for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+    result[j] = temp[j];  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) \
+      temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \
+    return static_cast<RESULTSCALART>(temp); \
+  }
+
+// char
+VIENNACL_NORM_1_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(char, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned char, int)
+
+// short
+VIENNACL_NORM_1_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(short, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_1_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(int, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_1_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(long, long)
+
+VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_1_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_1_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_1_IMPL_1
+#undef VIENNACL_NORM_1_IMPL_2
+
+}
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_1_impl(vector_base<NumericT> const & vec1,
+                 ScalarT & result)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  result = detail::norm_1_impl(data_vec1, start1, inc1, size1);  //Note: Assignment to result might be expensive, thus using a temporary for accumulation
+}
+
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) { \
+      RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \
+      temp += static_cast<TEMPSCALART>(data * data); \
+    } \
+    return static_cast<RESULTSCALART>(temp); \
+  }
+
+// char
+VIENNACL_NORM_2_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(char, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned char, int)
+
+
+// short
+VIENNACL_NORM_2_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(short, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_2_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(int, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_2_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(long, long)
+
+VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_2_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_2_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_2_IMPL_1
+#undef VIENNACL_NORM_2_IMPL_2
+
+}
+
+
+/** @brief Computes the l^2-norm of a vector - implementation
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_2_impl(vector_base<NumericT> const & vec1,
+                 ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1));  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_inf_impl(vector_base<NumericT> const & vec1,
+                   ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+  #ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+  #endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = 0;
+
+    for (vcl_size_t i = begin; i < end; ++i)
+      temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))));  //casting to double in order to avoid problems if T is an integer type
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::max<value_type>( temp[0], temp[i]);
+  result  = temp[0];
+}
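
The begin/end arithmetic above partitions the index range into nearly equal contiguous blocks without explicit remainder handling; a small sketch (sizes picked arbitrarily) shows the resulting split.

// Quick check of the thread partitioning formula used above:
#include <cstddef>
#include <iostream>

int main()
{
  std::size_t size1 = 10, thread_count = 4;
  for (std::size_t id = 0; id < thread_count; ++id)
  {
    std::size_t begin = (size1 * id) / thread_count;
    std::size_t end   = (size1 * (id + 1)) / thread_count;
    std::cout << "thread " << id << ": [" << begin << ", " << end << ")\n";
    // prints [0,2), [2,5), [5,7), [7,10): contiguous blocks covering all 10 indices
  }
  return 0;
}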
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec1 The vector
+* @return The index of the first entry with largest modulus. Note that the result must be a CPU scalar (here: vcl_size_t), since GPU scalars are floating point types.
+*/
+template<typename NumericT>
+vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
+{
+  typedef NumericT      value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+  std::vector<vcl_size_t> index(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    index[id]        = start1;
+    temp[id]         = 0;
+    value_type data;
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])));  //casting to double in order to avoid problems if T is an integer type
+      if (data > temp[id])
+      {
+        index[id] = i;
+        temp[id]  = data;
+      }
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+  {
+    if (temp[i] > temp[0])
+    {
+      index[0] = index[i];
+      temp[0] = temp[i];
+    }
+  }
+  return index[0];
+}
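
A plain-array analogue of the search above, illustrating why returning a CPU index makes expressions like vcl_rhs[index_norm_inf(vcl_rhs)] unambiguous; the helper name and the values are hypothetical.

// Hypothetical plain-array version of the index search and the subscripting it enables:
#include <cmath>
#include <cstddef>
#include <iostream>

std::size_t index_norm_inf_plain(double const * x, std::size_t n)
{
  std::size_t best = 0;
  double best_val = 0;
  for (std::size_t i = 0; i < n; ++i)
  {
    double v = std::fabs(x[i]);
    if (v > best_val) { best_val = v; best = i; }
  }
  return best;
}

int main()
{
  double rhs[] = {1.0, -7.5, 3.0};
  std::size_t idx = index_norm_inf_plain(rhs, 3);
  std::cout << rhs[idx] << std::endl;   // a plain index can be used for subscripting; prints -7.5
  return 0;
}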
+
+/** @brief Computes the maximum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void max_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = data_vec1[start1];
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      value_type v = data_vec1[i*inc1+start1];//Note: Evaluating the strided access inside std::max might be expensive, thus 'v' is used for the comparison
+      temp[id] = std::max<value_type>(temp[id],v);
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::max<value_type>( temp[0], temp[i]);
+  result  = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the minimum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void min_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = data_vec1[start1];
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      value_type v = data_vec1[i*inc1+start1];//Note: Evaluating the strided access inside std::min might be expensive, thus 'v' is used for the comparison
+      temp[id] = std::min<value_type>(temp[id],v);
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::min<value_type>( temp[0], temp[i]);
+  result  = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the sum of all elements from the vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void sum_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  value_type temp = 0;
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+
+  result = temp;  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1   The first vector
+* @param vec2   The second vector
+* @param alpha  The first transformation coefficient
+* @param beta   The second transformation coefficient
+*/
+template<typename NumericT>
+void plane_rotation(vector_base<NumericT> & vec1,
+                    vector_base<NumericT> & vec2,
+                    NumericT alpha, NumericT beta)
+{
+  typedef NumericT  value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  value_type data_alpha = alpha;
+  value_type data_beta  = beta;
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+  {
+    value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+    value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2;
+    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1;
+  }
+}
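
With alpha = cos(t) and beta = sin(t), the update above is a Givens rotation; a small numeric check on plain arrays (values are illustrative only):

// Plain-array check of (x,y) <- (alpha*x + beta*y, -beta*x + alpha*y):
#include <cmath>
#include <iostream>

int main()
{
  double x[] = {1.0, 0.0};
  double y[] = {0.0, 1.0};
  const double pi = std::acos(-1.0);
  double alpha = std::cos(pi / 2), beta = std::sin(pi / 2);   // a 90-degree rotation

  for (int i = 0; i < 2; ++i)
  {
    double t1 = x[i], t2 = y[i];
    x[i] = alpha * t1 + beta * t2;   // same update as plane_rotation above
    y[i] = alpha * t2 - beta * t1;
  }
  std::cout << x[0] << " " << x[1] << " / " << y[0] << " " << y[1] << std::endl;
  // prints approximately: 0 1 / -1 0, i.e. (x,y) is mapped to (y,-x)
  return 0;
}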
+
+namespace detail
+{
+  /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. */
+  template<typename NumericT>
+  void vector_scan_impl(vector_base<NumericT> const & vec1,
+                        vector_base<NumericT>       & vec2,
+                        bool is_inclusive)
+  {
+    NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1);
+    NumericT       * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2);
+
+    vcl_size_t start1 = viennacl::traits::start(vec1);
+    vcl_size_t inc1   = viennacl::traits::stride(vec1);
+    vcl_size_t size1  = viennacl::traits::size(vec1);
+    if (size1 < 1)
+      return;
+
+    vcl_size_t start2 = viennacl::traits::start(vec2);
+    vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+    {
+      std::vector<NumericT> thread_results(omp_get_max_threads());
+
+      // inclusive scan each thread segment:
+      #pragma omp parallel
+      {
+        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+        NumericT thread_sum = 0;
+        for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          thread_sum += data_vec1[i * inc1 + start1];
+
+        thread_results[omp_get_thread_num()] = thread_sum;
+      }
+
+      // exclusive-scan of thread results:
+      NumericT current_offset = 0;
+      for (vcl_size_t i=0; i<thread_results.size(); ++i)
+      {
+        NumericT tmp = thread_results[i];
+        thread_results[i] = current_offset;
+        current_offset += tmp;
+      }
+
+      // exclusive/inclusive scan of each segment with correct offset:
+      #pragma omp parallel
+      {
+        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+        NumericT thread_sum = thread_results[omp_get_thread_num()];
+        if (is_inclusive)
+        {
+          for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          {
+            thread_sum += data_vec1[i * inc1 + start1];
+            data_vec2[i * inc2 + start2] = thread_sum;
+          }
+        }
+        else
+        {
+          for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          {
+            NumericT tmp = data_vec1[i * inc1 + start1];
+            data_vec2[i * inc2 + start2] = thread_sum;
+            thread_sum += tmp;
+          }
+        }
+      }
+    } else
+#endif
+    {
+      NumericT sum = 0;
+      if (is_inclusive)
+      {
+        for(vcl_size_t i = 0; i < size1; i++)
+        {
+          sum += data_vec1[i * inc1 + start1];
+          data_vec2[i * inc2 + start2] = sum;
+        }
+      }
+      else
+      {
+        for(vcl_size_t i = 0; i < size1; i++)
+        {
+          NumericT tmp = data_vec1[i * inc1 + start1];
+          data_vec2[i * inc2 + start2] = sum;
+          sum += tmp;
+        }
+      }
+    }
+
+  }
+}
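
The OpenMP branch above proceeds in three phases: each thread inclusively sums its contiguous segment, the per-thread totals are exclusive-scanned into per-thread offsets, and each segment is then re-scanned starting from its offset. A serial walk-through of that phase structure (segment count and values chosen arbitrarily):

// Serial sketch of the three-phase scan used in the OpenMP branch above:
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  std::vector<double> in = {1, 2, 3, 4, 5, 6}, out(in.size());
  std::size_t threads = 3;                       // pretend segments: [0,2), [2,4), [4,6)
  std::vector<double> totals(threads, 0);
  std::size_t work = (in.size() - 1) / threads + 1;

  for (std::size_t t = 0; t < threads; ++t)      // phase 1: per-segment totals
    for (std::size_t i = t * work; i < std::min((t + 1) * work, in.size()); ++i)
      totals[t] += in[i];

  double offset = 0;                             // phase 2: exclusive scan of the totals
  for (std::size_t t = 0; t < threads; ++t)
  { double tmp = totals[t]; totals[t] = offset; offset += tmp; }

  for (std::size_t t = 0; t < threads; ++t)      // phase 3: inclusive scan with offsets
  {
    double sum = totals[t];
    for (std::size_t i = t * work; i < std::min((t + 1) * work, in.size()); ++i)
    { sum += in[i]; out[i] = sum; }
  }

  for (double v : out) std::cout << v << " ";    // prints 1 3 6 10 15 21
  std::cout << std::endl;
  return 0;
}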
+
+/** @brief This function implements an inclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1})
+*
+* @param vec1       The input vector.
+* @param vec2       The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT>       & vec2)
+{
+  detail::vector_scan_impl(vec1, vec2, true);
+}
+
+/** @brief This function implements an exclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2})
+*
+* @param vec1       The input vector.
+* @param vec2       The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT>       & vec2)
+{
+  detail::vector_scan_impl(vec1, vec2, false);
+}
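
The two variants differ only in whether element i contributes to position i: on (1, 2, 3, 4) the inclusive result is (1, 3, 6, 10) and the exclusive result is (0, 1, 3, 6). A small cross-check against the standard library (assuming a C++17 toolchain for std::exclusive_scan), for illustration only:

// Reference check of the scan semantics documented above:
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> in = {1, 2, 3, 4}, inc(4), exc(4);

  std::partial_sum(in.begin(), in.end(), inc.begin());        // inclusive: 1 3 6 10
  std::exclusive_scan(in.begin(), in.end(), exc.begin(), 0);  // exclusive: 0 1 3 6

  for (int v : inc) std::cout << v << " ";
  std::cout << "/ ";
  for (int v : exc) std::cout << v << " ";
  std::cout << std::endl;
  return 0;
}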
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
new file mode 100644
index 0000000..1038b2b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
@@ -0,0 +1,228 @@
+#ifndef VIENNACL_LINALG_ICHOL_HPP_
+#define VIENNACL_LINALG_ICHOL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ichol.hpp
+  @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern.
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete Cholesky factorization with static pattern (ICHOL0)
+*/
+class ichol0_tag {};
+
+
+/** @brief Implementation of an incomplete Cholesky (ICHOL0) preconditioner with static pattern. Optimized version for CSR matrices.
+  *
+  *  Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24–45,
+  *  for one of many descriptions of incomplete Cholesky factorizations.
+  *
+  *  @param A       The input matrix in CSR format
+  *  // param tag     An ichol0_tag in order to dispatch among several other preconditioners.
+  */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */)
+{
+  assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") );
+
+  NumericT           * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  //std::cout << A.size1() << std::endl;
+  for (vcl_size_t i=0; i<A.size1(); ++i)
+  {
+    unsigned int row_i_begin = row_buffer[i];
+    unsigned int row_i_end   = row_buffer[i+1];
+
+    // get a_ii:
+    NumericT a_ii = 0;
+    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+    {
+      if (col_buffer[buf_index_aii] == i)
+      {
+        a_ii = std::sqrt(elements[buf_index_aii]);
+        elements[buf_index_aii] = a_ii;
+        break;
+      }
+    }
+
+    // Now scale column/row i, i.e. A(k, i) /= A(i, i)
+    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+    {
+      if (col_buffer[buf_index_aii] > i)
+        elements[buf_index_aii] /= a_ii;
+    }
+
+    // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i:
+    for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j)
+    {
+      unsigned int j = col_buffer[buf_index_j];
+      if (j <= i)
+        continue;
+
+      NumericT a_ji = elements[buf_index_j];
+
+      for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k)
+      {
+        unsigned int k = col_buffer[buf_index_k];
+        if (k < j)
+          continue;
+
+        NumericT a_ki = elements[buf_index_k];
+
+        //Now check whether A(k, j) is in nonzero pattern:
+        unsigned int row_j_begin = row_buffer[j];
+        unsigned int row_j_end   = row_buffer[j+1];
+        for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj)
+        {
+          if (col_buffer[buf_index_kj] == k)
+          {
+            elements[buf_index_kj] -= a_ki * a_ji;
+            break;
+          }
+        }
+      }
+    }
+
+  }
+
+}
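
A minimal usage sketch of the routine above. The host-side assembly via std::vector<std::map<...> > and viennacl::copy is an assumption about the surrounding ViennaCL API, and the matrix values are illustrative only.

// Hypothetical host-side setup and factorization call:
#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/ichol.hpp"

void factorize_example()
{
  std::vector<std::map<unsigned int, double> > cpu_A(2);
  cpu_A[0][0] = 4.0;  cpu_A[0][1] = 2.0;        // small symmetric positive definite example
  cpu_A[1][0] = 2.0;  cpu_A[1][1] = 3.0;

  viennacl::context host_ctx(viennacl::MAIN_MEMORY);      // ICHOL0 requires main memory
  viennacl::compressed_matrix<double> A(2, 2, host_ctx);
  viennacl::copy(cpu_A, A);                               // assumed std::map-based copy overload

  viennacl::linalg::precondition(A, viennacl::linalg::ichol0_tag());
  // A now holds the incomplete Cholesky factor (stored transposed, in the upper triangle)
}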
+
+
+/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ichol0_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
+  {
+      //initialize preconditioner:
+      //std::cout << "Start CPU precond" << std::endl;
+      init(mat);
+      //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2());
+    NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle());
+
+    // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+    viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag());
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag());
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LLT, host_ctx);
+
+    viennacl::copy(mat, LLT);
+    viennacl::linalg::precondition(LLT, tag_);
+  }
+
+  ichol0_tag const & tag_;
+  viennacl::compressed_matrix<NumericType> LLT;
+};
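
As the class description says, the preconditioner is meant to be handed to a solve()-routine; a hedged sketch assuming ViennaCL's conjugate-gradient interface (viennacl/linalg/cg.hpp) and the compressed_matrix specialization defined below:

// Hypothetical use with the CG solver (solver header and call are assumptions):
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cg.hpp"
#include "viennacl/linalg/ichol.hpp"

viennacl::vector<double> solve_example(viennacl::compressed_matrix<double> const & A,
                                       viennacl::vector<double> const & rhs)
{
  viennacl::linalg::ichol0_tag ichol_config;
  viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<double> > precond(A, ichol_config);

  // iterative solve with the incomplete Cholesky factor applied in every iteration
  return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
}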
+
+
+/** @brief ICHOL0 preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ichol0_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(vector<NumericT> & vec) const
+  {
+    if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
+    {
+      viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+      viennacl::context old_ctx = viennacl::traits::context(vec);
+
+      viennacl::switch_memory_context(vec, host_ctx);
+      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+      viennacl::switch_memory_context(vec, old_ctx);
+    }
+    else //apply ICHOL0 directly:
+    {
+      // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LLT, host_ctx);
+    LLT = mat;
+
+    viennacl::linalg::precondition(LLT, tag_);
+  }
+
+  ichol0_tag const & tag_;
+  viennacl::compressed_matrix<NumericT> LLT;
+};
+
+}
+}
+
+
+
+
+#endif
+
+
+


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
new file mode 100644
index 0000000..0b93bb8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
@@ -0,0 +1,703 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/svd.hpp
+ *  @brief OpenCL kernel file for singular value decomposition */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template <typename StringType>
+void generate_svd_bidiag_pack(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void bidiag_pack(__global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("* D, \n");
+  source.append("  __global "); source.append(numeric_string); source.append("* S, \n");
+  source.append("  uint size1, \n");
+  source.append("  uint size2, \n");
+  source.append("  uint stride \n");
+  source.append(") { \n");
+  source.append("  uint size = min(size1, size2); \n");
+
+  source.append("  if(get_global_id(0) == 0) \n");
+  source.append("    S[0] = 0; \n");
+  if(is_row_major)
+    {
+      source.append("  for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
+      source.append("    D[i] = A[i*stride + i]; \n");
+      source.append("    S[i + 1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0; \n");
+    }
+  else
+    {
+      source.append("  for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
+      source.append("    D[i] = A[i*stride + i]; \n");
+      source.append("    S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0; \n");
+    }
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_col_reduce_lcl_array(StringT & source, std::string const & numeric_string)
+{
+  // calculates a sum of local array elements
+  source.append("void col_reduce_lcl_array(__local "); source.append(numeric_string); source.append("* sums, uint lcl_id, uint lcl_sz) { \n");
+  source.append("    uint step = lcl_sz >> 1; \n");
+
+  source.append("    while (step > 0) { \n");
+  source.append("        if (lcl_id < step) { \n");
+  source.append("            sums[lcl_id] += sums[lcl_id + step]; \n");
+  source.append("        } \n");
+  source.append("        step >>= 1; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
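
The generated helper is a classic halving reduction in local memory; a serial C++ analogue of the same pairing (barriers are unnecessary here because the loop is sequential):

// Serial analogue of the generated col_reduce_lcl_array reduction:
#include <iostream>

int main()
{
  double sums[8] = {1, 2, 3, 4, 5, 6, 7, 8};   // one slot per "work item", lcl_sz = 8
  unsigned int lcl_sz = 8;

  for (unsigned int step = lcl_sz >> 1; step > 0; step >>= 1)
    for (unsigned int lcl_id = 0; lcl_id < step; ++lcl_id)
      sums[lcl_id] += sums[lcl_id + step];     // same pairing as in the OpenCL kernel

  std::cout << sums[0] << std::endl;           // prints 36, the total of all slots
  return 0;
}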
+
+template <typename StringType>
+void generate_svd_copy_col(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  // probably, this is an ugly way
+  source.append("__kernel void copy_col(__global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                       __global "); source.append(numeric_string); source.append("* V, \n");
+  source.append("                       uint row_start, \n");
+  source.append("                       uint col_start, \n");
+  source.append("                       uint size, \n");
+  source.append("                       uint stride \n");
+  source.append("                       ) { \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+  if(is_row_major)
+    {
+      source.append("    for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
+      source.append("        V[i - row_start] = A[i * stride + col_start]; \n");
+      source.append("    } \n");
+    }
+  else
+    {
+      source.append("    for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
+      source.append("        V[i - row_start] = A[i + col_start * stride]; \n");
+      source.append("    } \n");
+    }
+
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_copy_row(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  // probably, this is an ugly way, too
+  source.append("__kernel void copy_row(__global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                       __global "); source.append(numeric_string); source.append("* V, \n");
+  source.append("                       uint row_start, \n");
+  source.append("                       uint col_start, \n");
+  source.append("                       uint size, \n");
+  source.append("                       uint stride \n");
+  source.append("                       ) { \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+  if(is_row_major)
+    {
+      source.append("    for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
+      source.append("        V[i - col_start] = A[row_start * stride + i]; \n");
+      source.append("    } \n");
+    }
+  else
+    {
+      source.append("    for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
+      source.append("        V[i - col_start] = A[row_start + i * stride]; \n");
+      source.append("    } \n");
+    }
+
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_final_iter_update(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void final_iter_update(__global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                                uint stride, \n");
+  source.append("                                uint n, \n");
+  source.append("                                uint last_n, \n");
+  source.append("                                "); source.append(numeric_string); source.append(" q, \n");
+  source.append("                                "); source.append(numeric_string); source.append(" p \n");
+  source.append("                                ) \n");
+  source.append("{ \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    for (uint px = glb_id; px < last_n; px += glb_sz) \n");
+  source.append("    { \n");
+  source.append("        "); source.append(numeric_string); source.append(" v_in = A[n * stride + px]; \n");
+  source.append("        "); source.append(numeric_string); source.append(" z = A[(n - 1) * stride + px]; \n");
+  source.append("        A[(n - 1) * stride + px] = q * z + p * v_in; \n");
+  source.append("        A[n * stride + px] = q * v_in - p * z; \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_givens_next(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void givens_next(__global "); source.append(numeric_string); source.append("* matr, \n");
+  source.append("                            __global "); source.append(numeric_string); source.append("* cs, \n");
+  source.append("                            __global "); source.append(numeric_string); source.append("* ss, \n");
+  source.append("                            uint size, \n");
+  source.append("                            uint stride, \n");
+  source.append("                            uint start_i, \n");
+  source.append("                            uint end_i \n");
+  source.append("                            ) \n");
+  source.append("{ \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("    uint j = glb_id; \n");
+
+  source.append("    __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+  source.append("    __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+  if(is_row_major)
+    {
+
+      source.append("    "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) + j * stride] : 0; \n");
+
+      source.append("    uint elems_num = end_i - start_i + 1; \n");
+      source.append("    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+      source.append("    for(uint block_id = 0; block_id < block_num; block_id++) \n");
+      source.append("    { \n");
+      source.append("        uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+      source.append("        if(lcl_id < to) \n");
+      source.append("        { \n");
+      source.append("            cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+      source.append("            ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+      source.append("        } \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        if(j < size) \n");
+      source.append("        { \n");
+      source.append("            for(uint ind = 0; ind < to; ind++) \n");
+      source.append("            { \n");
+      source.append("                uint i = end_i - (ind + block_id * lcl_sz); \n");
+
+      source.append("                "); source.append(numeric_string); source.append(" z = matr[i + j * stride]; \n");
+
+      source.append("                "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
+      source.append("                "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
+
+      source.append("                matr[(i + 1) + j * stride] = x * cs_val + z * ss_val; \n");
+      source.append("                x = -x * ss_val + z * cs_val; \n");
+      source.append("            } \n");
+      source.append("        } \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("    } \n");
+      source.append("    if(j < size) \n");
+      source.append("        matr[(start_i) + j * stride] = x; \n");
+    }
+  else
+    {
+
+      source.append("    "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) * stride + j] : 0; \n");
+
+      source.append("    uint elems_num = end_i - start_i + 1; \n");
+      source.append("    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+      source.append("    for(uint block_id = 0; block_id < block_num; block_id++) \n");
+      source.append("    { \n");
+      source.append("        uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+      source.append("        if(lcl_id < to) \n");
+      source.append("        { \n");
+      source.append("            cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+      source.append("            ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+      source.append("        } \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        if(j < size) \n");
+      source.append("        { \n");
+      source.append("            for(uint ind = 0; ind < to; ind++) \n");
+      source.append("            { \n");
+      source.append("                uint i = end_i - (ind + block_id * lcl_sz); \n");
+
+      source.append("                "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+      source.append("                "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
+      source.append("                "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
+
+      source.append("                matr[(i + 1) * stride + j] = x * cs_val + z * ss_val; \n");
+      source.append("                x = -x * ss_val + z * cs_val; \n");
+      source.append("            } \n");
+      source.append("        } \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("    } \n");
+      source.append("    if(j < size) \n");
+      source.append("        matr[(start_i) * stride + j] = x; \n");
+    }
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_givens_prev(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void givens_prev(__global "); source.append(numeric_string); source.append("* matr, \n");
+  source.append("                            __global "); source.append(numeric_string); source.append("* cs, \n");
+  source.append("                            __global "); source.append(numeric_string); source.append("* ss, \n");
+  source.append("                            uint size, \n");
+  source.append("                            uint stride, \n");
+  source.append("                            uint start_i, \n");
+  source.append("                            uint end_i \n");
+  source.append("                            ) \n");
+  source.append("{ \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("    uint j = glb_id; \n");
+
+  source.append("    __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+  source.append("    __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(start_i - 1) * stride + j] : 0; \n");
+
+  source.append("    uint elems_num = end_i - start_i; \n");
+  source.append("    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+  source.append("    for (uint block_id = 0; block_id < block_num; block_id++) \n");
+  source.append("    { \n");
+  source.append("        uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+  source.append("        if (lcl_id < to) \n");
+  source.append("        { \n");
+  source.append("            cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz]; \n");
+  source.append("            ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz]; \n");
+  source.append("        } \n");
+
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        if (j < size) \n");
+  source.append("        { \n");
+  source.append("            for (uint ind = 0; ind < to; ind++) \n");
+  source.append("            { \n");
+  source.append("                uint i = ind + start_i + block_id * lcl_sz; \n");
+
+  source.append("                "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+  source.append("                "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind];//cs[i]; \n");
+  source.append("                "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind];//ss[i]; \n");
+
+  source.append("                matr[(i - 1) * stride + j] = x * cs_val + z * ss_val; \n");
+  source.append("                x = -x * ss_val + z * cs_val; \n");
+  source.append("            } \n");
+  source.append("        } \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+  source.append("    if (j < size) \n");
+  source.append("        matr[(end_i - 1) * stride + j] = x; \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_house_update_A_left(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void house_update_A_left( \n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                        __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+  source.append("                        uint row_start, \n");
+  source.append("                        uint col_start, \n");
+  source.append("                        uint size1, \n");
+  source.append("                        uint size2, \n");
+  source.append("                        uint stride, \n");
+  source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+  source.append("                        ) { \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    uint grp_id = get_group_id(0); \n");
+  source.append("    uint grp_nm = get_num_groups(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+      // doing it in a slightly different way to avoid cache misses
+  if(is_row_major)
+    {
+      source.append("    for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
+      source.append("        ss = 0; \n");
+      source.append("        for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]); \n");
+
+      source.append("        for(uint j = row_start; j < size1; j++) \n");
+      source.append("            A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss); \n");
+      source.append("    } \n");
+    }
+  else
+    {
+      source.append("    for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
+      source.append("        ss = 0; \n");
+      source.append("        for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j + i * stride]); \n");
+
+      source.append("        for(uint j = row_start; j < size1; j++) \n");
+      source.append("            A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss); \n");
+      source.append("    } \n");
+    }
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_house_update_A_right(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+
+  source.append("__kernel void house_update_A_right( \n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+  source.append("                        uint row_start, \n");
+  source.append("                        uint col_start, \n");
+  source.append("                        uint size1, \n");
+  source.append("                        uint size2, \n");
+  source.append("                        uint stride, \n");
+  source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+  source.append("                        ) { \n");
+
+  source.append("    uint glb_id = get_global_id(0); \n");
+
+  source.append("    uint grp_id = get_group_id(0); \n");
+  source.append("    uint grp_nm = get_num_groups(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+      // update of A matrix
+  if(is_row_major)
+    {
+      source.append("    for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
+      source.append("        ss = 0; \n");
+
+      source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]); \n");
+      source.append("        sums[lcl_id] = ss; \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
+
+      source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+      source.append("            A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av); \n");
+      source.append("    } \n");
+    }
+  else
+    {
+      source.append("    for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
+      source.append("        ss = 0; \n");
+
+      source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i + j * stride]); \n");
+      source.append("        sums[lcl_id] = ss; \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
+
+      source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+      source.append("            A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av); \n");
+      source.append("    } \n");
+    }
+
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_svd_house_update_QL(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void house_update_QL(\n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* QL, \n");
+  source.append("                        __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+  source.append("                        uint size1, \n");
+  source.append("                        uint strideQ, \n");
+  source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+  source.append("                        ) { \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    uint grp_id = get_group_id(0); \n");
+  source.append("    uint grp_nm = get_num_groups(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+  if(is_row_major)
+    {
+      source.append("    for(uint i = grp_id; i < size1; i += grp_nm) { \n");
+      source.append("        ss = 0; \n");
+      source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]); \n");
+      source.append("        sums[lcl_id] = ss; \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+
+      source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
+      source.append("            QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+      source.append("    } \n");
+    }
+  else
+    {
+      source.append("    for(uint i = grp_id; i < size1; i += grp_nm) { \n");
+      source.append("        ss = 0; \n");
+      source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i + j * strideQ]); \n");
+      source.append("        sums[lcl_id] = ss; \n");
+
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+      source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+      source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      source.append("        "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+
+      source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
+      source.append("            QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv); \n");
+      source.append("    } \n");
+    }
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_svd_house_update_QR(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void house_update_QR( \n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* QR, \n");
+  source.append("                        __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+  source.append("                        uint size1, \n");
+  source.append("                        uint size2, \n");
+  source.append("                        uint strideQ, \n");
+  source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+  source.append("                        ) { \n");
+
+  source.append("    uint glb_id = get_global_id(0); \n");
+
+  source.append("    uint grp_id = get_group_id(0); \n");
+  source.append("    uint grp_nm = get_num_groups(0); \n");
+
+  source.append("    uint lcl_id = get_local_id(0); \n");
+  source.append("    uint lcl_sz = get_local_size(0); \n");
+
+  source.append("   "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+      // update of QR matrix
+      // Actually, we are calculating a transpose of the right matrix. This allows us to
+      // avoid cache misses.
+  source.append("    for (uint i = grp_id; i < size2; i += grp_nm) { \n");
+  source.append("        ss = 0; \n");
+  source.append("        for (uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]); \n");
+  source.append("        sums[lcl_id] = ss; \n");
+
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+  source.append("        for (uint j = lcl_id; j < size2; j += lcl_sz) \n");
+  source.append("            QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_inverse_signs(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void inverse_signs(__global "); source.append(numeric_string); source.append("* v, \n");
+  source.append("                            __global "); source.append(numeric_string); source.append("* signs, \n");
+  source.append("                            uint size, \n");
+  source.append("                            uint stride \n");
+  source.append("                            ) \n");
+  source.append("{ \n");
+  source.append("    uint glb_id_x = get_global_id(0); \n");
+  source.append("    uint glb_id_y = get_global_id(1); \n");
+
+  source.append("    if ((glb_id_x < size) && (glb_id_y < size)) \n");
+  source.append("        v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x]; \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_svd_transpose_inplace(StringT & source, std::string const & numeric_string)
+{
+
+  source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("* input, \n");
+  source.append("                        unsigned int row_num, \n");
+  source.append("                        unsigned int col_num) { \n");
+  source.append("    unsigned int size = row_num * col_num; \n");
+  source.append("    for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+  source.append("        unsigned int row = i / col_num; \n");
+  source.append("        unsigned int col = i - row*col_num; \n");
+
+  source.append("        unsigned int new_pos = col * row_num + row; \n");
+
+          //new_pos = (col < row) ? 0 : 1;
+          //input[i] = new_pos;
+
+  source.append("        if (i < new_pos) { \n");
+  source.append("            "); source.append(numeric_string); source.append(" val = input[i]; \n");
+  source.append("            input[i] = input[new_pos]; \n");
+  source.append("            input[new_pos] = val; \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+}
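
The generated kernel swaps element i with its transposed position only when i < new_pos, so each pair is exchanged exactly once; a serial analogue of the same index arithmetic, shown for a square matrix:

// Serial analogue of the generated transpose_inplace index logic (square case):
#include <iostream>

int main()
{
  // 3x3 row-major matrix {1 2 3; 4 5 6; 7 8 9}
  double input[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  unsigned int row_num = 3, col_num = 3, size = row_num * col_num;

  for (unsigned int i = 0; i < size; ++i)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;   // target slot in the transposed layout
    if (i < new_pos)
    { double val = input[i]; input[i] = input[new_pos]; input[new_pos] = val; }
  }

  for (unsigned int i = 0; i < size; ++i) std::cout << input[i] << " ";
  std::cout << std::endl;                         // prints 1 4 7 2 5 8 3 6 9
  return 0;
}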
+
+template<typename StringT>
+void generate_svd_update_qr_column(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void update_qr_column(__global "); source.append(numeric_string); source.append("* A, \n");
+  source.append("                               uint stride, \n");
+  source.append("                               __global "); source.append(numeric_string); source.append("* buf, \n");
+  source.append("                               int m, \n");
+  source.append("                               int n, \n");
+  source.append("                               int last_n) \n");
+  source.append("{ \n");
+  source.append("    uint glb_id = get_global_id(0); \n");
+  source.append("    uint glb_sz = get_global_size(0); \n");
+
+  source.append("    for (int i = glb_id; i < last_n; i += glb_sz) \n");
+  source.append("    { \n");
+  source.append("        "); source.append(numeric_string); source.append(" a_ik = A[m * stride + i], a_ik_1, a_ik_2; \n");
+
+  source.append("        a_ik_1 = A[(m + 1) * stride + i]; \n");
+
+  source.append("        for (int k = m; k < n; k++) \n");
+  source.append("        { \n");
+  source.append("            bool notlast = (k != n - 1); \n");
+
+  source.append("            "); source.append(numeric_string); source.append(" p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; \n");
+
+  source.append("            if (notlast) \n");
+  source.append("            { \n");
+  source.append("                a_ik_2 = A[(k + 2) * stride + i]; \n");
+  source.append("                p = p + buf[5 * k + 2] * a_ik_2; \n");
+  source.append("                a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; \n");
+  source.append("            } \n");
+
+  source.append("            A[k * stride + i] = a_ik - p; \n");
+  source.append("            a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; \n");
+
+  source.append("            a_ik = a_ik_1; \n");
+  source.append("            a_ik_1 = a_ik_2; \n");
+  source.append("        } \n");
+
+  source.append("        A[n * stride + i] = a_ik; \n");
+  source.append("    } \n");
+
+  source.append("} \n");
+}
+
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices. */
+template<typename NumericT, typename MatrixLayout = row_major>
+struct svd
+{
+  static std::string program_name()
+  {
+    bool is_row = viennacl::is_row_major<MatrixLayout>::value;
+    return (viennacl::ocl::type_to_string<NumericT>::apply() + "_svd_") + (is_row ? "row" : "col");
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+      bool is_row_major = viennacl::is_row_major<MatrixLayout>::value;
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // only generate for floating points (forces error for integers)
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        //helper function used by multiple kernels:
+        generate_svd_col_reduce_lcl_array(source, numeric_string);
+
+        //kernels:
+        generate_svd_bidiag_pack(source, numeric_string, is_row_major);
+        generate_svd_copy_col(source, numeric_string, is_row_major);
+        generate_svd_copy_row(source, numeric_string, is_row_major);
+        generate_svd_final_iter_update(source, numeric_string);
+        generate_svd_givens_next(source, numeric_string, is_row_major);
+        generate_svd_givens_prev(source, numeric_string);
+        generate_svd_house_update_A_left(source, numeric_string, is_row_major);
+        generate_svd_house_update_A_right(source, numeric_string, is_row_major);
+        generate_svd_house_update_QL(source, numeric_string, is_row_major);
+        generate_svd_house_update_QR(source, numeric_string);
+        generate_svd_inverse_signs(source, numeric_string);
+        generate_svd_transpose_inplace(source, numeric_string);
+        generate_svd_update_qr_column(source, numeric_string);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
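
For reference, a minimal host-side sketch of how such a generated kernel program is used (an illustration, not part of this commit; it assumes the usual viennacl::ocl context API, i.e. current_context() and get_kernel(), and an include path following the opencl/kernels/ layout of the files in this patch):

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/svd.hpp"  // assumed path

    void prepare_svd_kernels()
    {
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();

      // compiles the "<type>_svd_row" program on first use (see init() above)
      viennacl::linalg::opencl::kernels::svd<float>::init(ctx);

      // individual kernels are then fetched by program name and kernel name,
      // e.g. the update_qr_column kernel generated above:
      viennacl::ocl::kernel & k =
          ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<float>::program_name(),
                         "update_qr_column");
      (void)k;  // launched later via viennacl::ocl::enqueue(k(...))
    }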

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
new file mode 100644
index 0000000..b6a2b7d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
@@ -0,0 +1,867 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+#include "viennacl/scheduler/preset.hpp"
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+
+
+/** @file viennacl/linalg/opencl/kernels/vector.hpp
+ *  @brief OpenCL kernel file for vector operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in avbv-like operations */
+enum avbv_scalar_type
+{
+  VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
+  VIENNACL_AVBV_CPU,
+  VIENNACL_AVBV_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of vectors */
+struct avbv_config
+{
+  avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {}
+
+  bool with_stride_and_range;
+  std::string      assign_op;
+  avbv_scalar_type a;
+  avbv_scalar_type b;
+};
+
+// just returns the for-loop
+template <typename StringType>
+void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
+{
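+  // Packing convention of the uint4 size arguments, as used by the indexing below:
+  // .x = start offset, .y = stride (increment), .z = logical vector size;
+  // the .w component is not referenced in this loop.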
+  source.append("    for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+  if (cfg.with_stride_and_range)
+  {
+    source.append("      vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
+    if (mult_alpha)
+      source.append("* alpha ");
+    else
+      source.append("/ alpha ");
+    if (cfg.b != VIENNACL_AVBV_NONE)
+    {
+      source.append("+ vec3[i*size3.y+size3.x] ");
+      if (mult_beta)
+        source.append("* beta");
+      else
+        source.append("/ beta");
+    }
+  }
+  else
+  {
+    source.append("    vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
+    if (mult_alpha)
+      source.append("* alpha ");
+    else
+      source.append("/ alpha ");
+    if (cfg.b != VIENNACL_AVBV_NONE)
+    {
+      source.append("+ vec3[i] ");
+      if (mult_beta)
+        source.append("* beta");
+      else
+        source.append("/ beta");
+    }
+  }
+  source.append("; \n");
+}
+
+template <typename StringType>
+void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
+{
+  source.append("__kernel void av");
+  if (cfg.b != VIENNACL_AVBV_NONE)
+    source.append("bv");
+  if (cfg.assign_op != "=")
+    source.append("_v");
+
+  if (cfg.a == VIENNACL_AVBV_CPU)
+    source.append("_cpu");
+  else if (cfg.a == VIENNACL_AVBV_GPU)
+    source.append("_gpu");
+
+  if (cfg.b == VIENNACL_AVBV_CPU)
+    source.append("_cpu");
+  else if (cfg.b == VIENNACL_AVBV_GPU)
+    source.append("_gpu");
+  source.append("( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("  uint4 size1, \n");
+  source.append(" \n");
+  if (cfg.a == VIENNACL_AVBV_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+  }
+  else if (cfg.a == VIENNACL_AVBV_GPU)
+  {
+    source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+  }
+  source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+  source.append("  __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("  uint4 size2");
+
+  if (cfg.b != VIENNACL_AVBV_NONE)
+  {
+    source.append(", \n\n");
+    if (cfg.b == VIENNACL_AVBV_CPU)
+    {
+      source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+    }
+    else if (cfg.b == VIENNACL_AVBV_GPU)
+    {
+      source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+    }
+    source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+    source.append("  __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+    source.append("  uint4 size3 \n");
+  }
+  source.append(") { \n");
+
+  if (cfg.a == VIENNACL_AVBV_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+  }
+  else if (cfg.a == VIENNACL_AVBV_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+  }
+  source.append("  if (options2 & (1 << 0)) \n");
+  source.append("    alpha = -alpha; \n");
+  source.append(" \n");
+
+  if (cfg.b == VIENNACL_AVBV_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+  }
+  else if (cfg.b == VIENNACL_AVBV_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+  }
+  if (cfg.b != VIENNACL_AVBV_NONE)
+  {
+    source.append("  if (options3 & (1 << 0)) \n");
+    source.append("    beta = -beta; \n");
+    source.append(" \n");
+  }
+  source.append("  if (options2 & (1 << 1)) { \n");
+  if (cfg.b != VIENNACL_AVBV_NONE)
+  {
+    source.append("    if (options3 & (1 << 1)) {\n");
+    generate_avbv_impl2(source, numeric_string, cfg, false, false);
+    source.append("    } else {\n");
+    generate_avbv_impl2(source, numeric_string, cfg, false, true);
+    source.append("    } \n");
+  }
+  else
+    generate_avbv_impl2(source, numeric_string, cfg, false, true);
+  source.append("  } else { \n");
+  if (cfg.b != VIENNACL_AVBV_NONE)
+  {
+    source.append("    if (options3 & (1 << 1)) {\n");
+    generate_avbv_impl2(source, numeric_string, cfg, true, false);
+    source.append("    } else {\n");
+    generate_avbv_impl2(source, numeric_string, cfg, true, true);
+    source.append("    } \n");
+  }
+  else
+    generate_avbv_impl2(source, numeric_string, cfg, true, true);
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_avbv(StringType & source, std::string const & numeric_string)
+{
+  avbv_config cfg;
+  cfg.assign_op = "=";
+  cfg.with_stride_and_range = true;
+
+  // av
+  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+  // avbv
+  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+  // avbv with in-place add (assign_op "+=", kernel names carry an additional "_v")
+  cfg.assign_op = "+=";
+
+  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+}
+
+template <typename StringType>
+void generate_plane_rotation(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void plane_rotation( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("          unsigned int start2, \n");
+  source.append("          unsigned int inc2, \n");
+  source.append("          unsigned int size2, \n");
+  source.append("          "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("          "); source.append(numeric_string); source.append(" beta) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp1 = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp2 = 0; \n");
+  source.append(" \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+  source.append(" { \n");
+  source.append("    tmp1 = vec1[i*inc1+start1]; \n");
+  source.append("    tmp2 = vec2[i*inc2+start2]; \n");
+  source.append(" \n");
+  source.append("    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
+  source.append("    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
+  source.append("  } \n");
+  source.append(" \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_vector_swap(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void swap( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("          unsigned int start2, \n");
+  source.append("          unsigned int inc2, \n");
+  source.append("          unsigned int size2 \n");
+  source.append("          ) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    tmp = vec2[i*inc2+start2]; \n");
+  source.append("    vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
+  source.append("    vec1[i*inc1+start1] = tmp; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_assign_cpu(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void assign_cpu( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          unsigned int internal_size1, \n");
+  source.append("          "); source.append(numeric_string); source.append(" alpha) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
+  source.append("    vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
+  source.append("} \n");
+
+}
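+// Note: assign_cpu pads the buffer, i.e. entries beyond the logical size1 (up to
+// internal_size1) are set to zero rather than to alpha.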
+
+template <typename StringType>
+void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
+{
+  std::stringstream ss;
+  ss << vector_num;
+  std::string vector_num_string = ss.str();
+
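+  // First stage of a two-stage reduction: each work group accumulates partial dot
+  // products of x with y0..y(vector_num-1) in local memory and writes one value per
+  // vector to group_buffer; the reduction over work groups happens in a second pass
+  // (cf. the sum kernels further below).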
+  source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("          uint4 params_x, \n");
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("          __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
+    source.append("          uint4 params_y"); source.append(ss.str()); source.append(", \n");
+  }
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+  source.append("{ \n");
+  source.append("  unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
+  source.append("  unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
+  source.append("  unsigned int vec_stop_index  = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
+
+  // compute partial results within group:
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("  "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
+  }
+  source.append("  for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
+  source.append("    ");  source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("    tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
+  }
+  source.append("  } \n");
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("  tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
+  }
+
+  // now run reduction:
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("      tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
+  }
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("  if (get_local_id(0) == 0) { \n");
+  for (vcl_size_t i=0; i<vector_num; ++i)
+  {
+    ss.str("");
+    ss << i;
+    source.append("    group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
+  }
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_norm(StringType & source, std::string const & numeric_string)
+{
+  bool is_float_or_double = (numeric_string == "float" || numeric_string == "double");
+
+  source.append(numeric_string); source.append(" impl_norm( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          unsigned int norm_selector, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
+  source.append("  if (norm_selector == 1) \n"); //norm_1
+  source.append("  { \n");
+  source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+  if (is_float_or_double)
+    source.append("      tmp += fabs(vec[i*inc1 + start1]); \n");
+  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+    source.append("      tmp += vec[i*inc1 + start1]; \n");
+  else
+    source.append("      tmp += abs(vec[i*inc1 + start1]); \n");
+  source.append("  } \n");
+  source.append("  else if (norm_selector == 2) \n"); //norm_2
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" vec_entry = 0; \n");
+  source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+  source.append("    { \n");
+  source.append("      vec_entry = vec[i*inc1 + start1]; \n");
+  source.append("      tmp += vec_entry * vec_entry; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("  else if (norm_selector == 0) \n"); //norm_inf
+  source.append("  { \n");
+  source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+  if (is_float_or_double)
+    source.append("      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
+  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+    source.append("      tmp = max(vec[i*inc1 + start1], tmp); \n");
+  else
+  {
+    source.append("      tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n");
+  }
+  source.append("  } \n");
+
+  source.append("  tmp_buffer[get_local_id(0)] = tmp; \n");
+
+  source.append("  if (norm_selector > 0) \n"); //norm_1 or norm_2:
+  source.append("  { \n");
+  source.append("    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("    { \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      if (get_local_id(0) < stride) \n");
+  source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
+  source.append("    } \n");
+  source.append("    return tmp_buffer[0]; \n");
+  source.append("  } \n");
+
+  //norm_inf:
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  if (is_float_or_double)
+    source.append("      tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+  else
+    source.append("      tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+  source.append("  } \n");
+
+  source.append("  return tmp_buffer[0]; \n");
+  source.append("}; \n");
+
+  source.append("__kernel void norm( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          unsigned int norm_selector, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n");
+  source.append("                        (        get_group_id(0)  * size1) / get_num_groups(0) * inc1 + start1, \n");
+  source.append("                        inc1, \n");
+  source.append("                        (   (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
+  source.append("                      - (        get_group_id(0)  * size1) / get_num_groups(0), \n");
+  source.append("                        norm_selector, \n");
+  source.append("                        tmp_buffer); \n");
+
+  source.append("  if (get_local_id(0) == 0) \n");
+  source.append("    group_buffer[get_group_id(0)] = tmp; \n");
+  source.append("} \n");
+
+}
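+// Note: the norm kernel above yields one partial result per work group (for norm_2
+// these are partial sums of squares, without the final sqrt); the entries of
+// group_buffer are combined in a second pass, e.g. by the sum kernel below
+// (option == 2 adds the final sqrt for norm_2).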
+
+template <typename StringType>
+void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
+{
+  // second reduction stage: each work group sums a sub-array of 'vec1' of length 'size_per_workgroup' (the partial results of one inner product) and writes its total to 'result'.
+  source.append("__kernel void sum_inner_prod( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int size_per_workgroup, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("          unsigned int start_result, \n");
+  source.append("          unsigned int inc_result) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
+  source.append("  for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n");
+  source.append("    thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n");
+
+  source.append("  tmp_buffer[get_local_id(0)] = thread_sum; \n");
+
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+  source.append("  } \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("  if (get_local_id(0) == 0) \n");
+  source.append("    result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_sum(StringType & source, std::string const & numeric_string)
+{
+  // reduces the array 'vec1' and writes to 'result': a plain sum (option 1), a sum followed by sqrt (option 2), or the maximum of absolute values (option 0). Makes use of a single work-group only.
+  source.append("__kernel void sum( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          unsigned int option,  \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
+  source.append("  for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    if (option > 0) \n");
+  source.append("      thread_sum += vec1[i*inc1+start1]; \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      tmp = vec1[i*inc1+start1]; \n");
+  source.append("      tmp = (tmp < 0) ? -tmp : tmp; \n");
+  source.append("      thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  source.append("  tmp_buffer[get_local_id(0)] = thread_sum; \n");
+
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("    { \n");
+  source.append("      if (option > 0) \n");
+  source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+  source.append("      else \n");
+  source.append("        tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("  if (get_global_id(0) == 0) \n");
+  source.append("  { \n");
+  if (numeric_string == "float" || numeric_string == "double")
+  {
+    source.append("    if (option == 2) \n");
+    source.append("      *result = sqrt(tmp_buffer[0]); \n");
+    source.append("    else \n");
+  }
+  source.append("      *result = tmp_buffer[0]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
+{
+  //index_norm_inf:
+  source.append("unsigned int index_norm_inf_impl( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+  source.append("          __local unsigned int * index_buffer) \n");
+  source.append("{ \n");
+  //step 1: fill buffer:
+  source.append("  "); source.append(numeric_string); source.append(" cur_max = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+  source.append("  { \n");
+  if (numeric_string == "float" || numeric_string == "double")
+    source.append("    tmp = fabs(vec[i*inc1+start1]); \n");
+  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+    source.append("    tmp = vec[i*inc1+start1]; \n");
+  else
+    source.append("    tmp = abs(vec[i*inc1+start1]); \n");
+  source.append("    if (cur_max < tmp) \n");
+  source.append("    { \n");
+  source.append("      entry_buffer[get_global_id(0)] = tmp; \n");
+  source.append("      index_buffer[get_global_id(0)] = i; \n");
+  source.append("      cur_max = tmp; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  //step 2: parallel reduction:
+  source.append("  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_global_id(0) < stride) \n");
+  source.append("   { \n");
+  //find the first occurring index
+  source.append("      if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
+  source.append("      { \n");
+  source.append("        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
+  source.append("        entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append(" \n");
+  source.append("  return index_buffer[0]; \n");
+  source.append("} \n");
+
+  source.append("__kernel void index_norm_inf( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+  source.append("          __local unsigned int * index_buffer, \n");
+  source.append("          __global unsigned int * result) \n");
+  source.append("{ \n");
+  source.append("  entry_buffer[get_global_id(0)] = 0; \n");
+  source.append("  index_buffer[get_global_id(0)] = 0; \n");
+  source.append("  unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
+  source.append("  if (get_global_id(0) == 0) *result = tmp; \n");
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_maxmin(StringType & source, std::string const & numeric_string, bool is_max)
+{
+  // computes the maximum (is_max == true) or minimum of 'vec1'; each work group writes its partial result to 'result'.
+  if (is_max)
+    source.append("__kernel void max_kernel( \n");
+  else
+    source.append("__kernel void min_kernel( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("          unsigned int start1, \n");
+  source.append("          unsigned int inc1, \n");
+  source.append("          unsigned int size1, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" thread_result = vec1[start1]; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" tmp = vec1[i*inc1+start1]; \n");
+  if (is_max)
+    source.append("      thread_result = thread_result > tmp ? thread_result : tmp; \n");
+  else
+    source.append("      thread_result = thread_result < tmp ? thread_result : tmp; \n");
+  source.append("  } \n");
+
+  source.append("  tmp_buffer[get_local_id(0)] = thread_result; \n");
+
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("    { \n");
+  if (is_max)
+    source.append("        tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+  else
+    source.append("        tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("  if (get_local_id(0) == 0) \n");
+  source.append("    result[get_group_id(0)] = tmp_buffer[0]; \n");
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without involving matrices, multiple inner products, or element-wise operations other than addition or subtraction. */
+template<typename NumericT>
+struct vector
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+    std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parametrized kernels:
+      generate_avbv(source, numeric_string);
+
+      // kernels with mostly predetermined skeleton:
+      generate_plane_rotation(source, numeric_string);
+      generate_vector_swap(source, numeric_string);
+      generate_assign_cpu(source, numeric_string);
+
+      generate_inner_prod(source, numeric_string, 1);
+      generate_norm(source, numeric_string);
+      generate_sum(source, numeric_string);
+      generate_index_norm_inf(source, numeric_string);
+      generate_maxmin(source, numeric_string, true);
+      generate_maxmin(source, numeric_string, false);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+// class with kernels for multiple inner products.
+/** @brief Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<>. */
+template<typename NumericT>
+struct vector_multi_inner_prod
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_multi";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+    std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_inner_prod(source, numeric_string, 2);
+      generate_inner_prod(source, numeric_string, 3);
+      generate_inner_prod(source, numeric_string, 4);
+      generate_inner_prod(source, numeric_string, 8);
+
+      generate_inner_prod_sum(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+
+template<typename StringT>
+void generate_vector_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ source.append(" __kernel void convert_" + dest_type + "_" + src_type + "( \n");
+ source.append("  __global " + dest_type + " * dest, \n");
+ source.append("  unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n");
+ source.append("  __global const " + src_type + " * src, \n");
+ source.append("  unsigned int start_src, unsigned int inc_src) \n");
+ source.append("  { \n");
+ source.append("   for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n");
+ source.append("     dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n");
+ source.append("  } \n");
+}
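+// The emitted kernels are named convert_<dest>_<src> (e.g. convert_float_int) and
+// rely on OpenCL's implicit scalar conversion in the assignment above.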
+
+/** @brief Main kernel class for vector conversion routines (e.g. convert vector<int> to vector<float>). */
+struct vector_convert
+{
+
+public:
+  static std::string program_name()
+  {
+    return "vector_convert";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(4096);
+
+      // int
+      generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // unsigned int
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // long
+      generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // unsigned long
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // float
+      generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      if (ctx.current_device().double_support())
+      {
+        viennacl::ocl::append_double_precision_pragma<double>(ctx, source);
+
+        generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(),           viennacl::ocl::type_to_string<double>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(),  viennacl::ocl::type_to_string<double>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(),          viennacl::ocl::type_to_string<double>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(),         viennacl::ocl::type_to_string<double>::apply());
+
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<int>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<long>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<float>::apply());
+        generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<double>::apply());
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+
+};
+
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
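
For reference, the kernel names produced by generate_avbv() above combine the operation ("av" or "avbv"), an optional "_v" for the in-place variant (assign_op "+="), and one "_cpu"/"_gpu" suffix per scalar, e.g. av_cpu, avbv_gpu_cpu, avbv_v_gpu_gpu. A small sketch that dumps the generated source for inspection (an illustration only; it assumes an OpenCL-enabled ViennaCL build and the header path used in this patch):

    #include <iostream>
    #include <string>
    #include "viennacl/linalg/opencl/kernels/vector.hpp"  // assumed path

    int main()
    {
      std::string src;
      // emit the full av/avbv kernel family for float, exactly as vector<float>::init() does
      viennacl::linalg::opencl::kernels::generate_avbv(src, "float");
      std::cout << src << std::endl;  // __kernel void av_cpu(...), avbv_cpu_cpu(...), ...
      return 0;
    }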

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
new file mode 100644
index 0000000..8445302
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
@@ -0,0 +1,163 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/vector_element.hpp
+ *  @brief OpenCL kernel file for element-wise vector operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// generate code for the element-wise unary operation vec1 op funcname(vec2), e.g. vec1 = sin(vec2), on (possibly strided and ranged) vectors
+template <typename StringT>
+void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string,
+                                       std::string const & funcname, std::string const & op, std::string const & op_name)
+{
+  source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("    uint4 size1, \n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("    uint4 size2) { \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+  if (numeric_string[0] == 'u' && funcname == "abs") // abs() on unsigned does not work on MacOS X 10.6.8, so we use the identity:
+  {
+    source.append("    vec1[i*size1.y+size1.x] "); source.append(op); source.append(" vec2[i*size2.y+size2.x]; \n");
+  }
+  else
+  {
+    source.append("    vec1[i*size1.y+size1.x] "); source.append(op); source.append(" "); source.append(funcname); source.append("(vec2[i*size2.y+size2.x]); \n");
+  }
+  source.append("} \n");
+}
+
+template <typename StringT>
+void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string, std::string const & funcname)
+{
+  generate_vector_unary_element_ops(source, numeric_string, funcname, "=", "assign");
+  //generate_vector_unary_element_ops(source, numeric_string, funcname, "+=", "plus");
+  //generate_vector_unary_element_ops(source, numeric_string, funcname, "-=", "minus");
+}
+
+template <typename StringT>
+void generate_vector_binary_element_ops(StringT & source, std::string const & numeric_string, int op_type) //op_type: {0: product, 1: division, 2: power}
+{
+  std::string kernel_name_suffix;
+  if (op_type == 0)
+    kernel_name_suffix = "prod";
+  else if (op_type == 1)
+    kernel_name_suffix = "div";
+  else
+    kernel_name_suffix = "pow";
+
+  // generic kernel for the element-wise binary operation vec1 = vec2 op vec3 (op: product, division, or power), where vec1, vec2, vec3 are not necessarily distinct vectors
+  source.append("__kernel void element_" + kernel_name_suffix + "(\n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("    unsigned int start1, \n");
+  source.append("    unsigned int inc1, \n");
+  source.append("    unsigned int size1, \n");
+
+  source.append("    __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("    unsigned int start2, \n");
+  source.append("    unsigned int inc2, \n");
+
+  source.append("    __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+  source.append("   unsigned int start3, \n");
+  source.append("   unsigned int inc3, \n");
+
+  source.append("   unsigned int op_type) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+  if (op_type == 0)
+    source.append("    vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; \n");
+  else if (op_type == 1)
+    source.append("    vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; \n");
+  else if (op_type == 2)
+    source.append("    vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]); \n");
+
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for elementwise operations other than addition and subtraction on/with viennacl::vector<>. */
+template<typename NumericT>
+struct vector_element
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_element";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+    std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // unary operations
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_vector_unary_element_ops(source, numeric_string, "acos");
+        generate_vector_unary_element_ops(source, numeric_string, "asin");
+        generate_vector_unary_element_ops(source, numeric_string, "atan");
+        generate_vector_unary_element_ops(source, numeric_string, "ceil");
+        generate_vector_unary_element_ops(source, numeric_string, "cos");
+        generate_vector_unary_element_ops(source, numeric_string, "cosh");
+        generate_vector_unary_element_ops(source, numeric_string, "exp");
+        generate_vector_unary_element_ops(source, numeric_string, "fabs");
+        generate_vector_unary_element_ops(source, numeric_string, "floor");
+        generate_vector_unary_element_ops(source, numeric_string, "log");
+        generate_vector_unary_element_ops(source, numeric_string, "log10");
+        generate_vector_unary_element_ops(source, numeric_string, "sin");
+        generate_vector_unary_element_ops(source, numeric_string, "sinh");
+        generate_vector_unary_element_ops(source, numeric_string, "sqrt");
+        generate_vector_unary_element_ops(source, numeric_string, "tan");
+        generate_vector_unary_element_ops(source, numeric_string, "tanh");
+      }
+      else
+      {
+        generate_vector_unary_element_ops(source, numeric_string, "abs");
+      }
+
+      // binary operations
+      generate_vector_binary_element_ops(source, numeric_string, 0);
+      generate_vector_binary_element_ops(source, numeric_string, 1);
+      if (numeric_string == "float" || numeric_string == "double")
+        generate_vector_binary_element_ops(source, numeric_string, 2);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
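
At the user-facing level these kernels back the element_prod()/element_div()/element_pow() and element_<func>() convenience functions on viennacl::vector<>. A short usage sketch (an illustration only; it assumes the standard ViennaCL vector API, which is not part of this diff):

    #include "viennacl/vector.hpp"

    void elementwise_demo()
    {
      viennacl::vector<float> v1 = viennacl::scalar_vector<float>(16, 2.0f);
      viennacl::vector<float> v2 = viennacl::scalar_vector<float>(16, 3.0f);
      viennacl::vector<float> v3(16);

      v3 = viennacl::linalg::element_prod(v1, v2);  // dispatches to element_prod above
      v3 = viennacl::linalg::element_pow(v1, v2);   // element_pow (float/double only)
      v3 = viennacl::linalg::element_sqrt(v1);      // unary op, "sqrt_assign" kernel
    }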


[45/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
new file mode 100644
index 0000000..2a24a4e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
@@ -0,0 +1,506 @@
+#ifndef VIENNACL_COORDINATE_MATRIX_HPP_
+#define VIENNACL_COORDINATE_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/coordinate_matrix.hpp
+    @brief Implementation of the coordinate_matrix class
+*/
+
+#include <map>
+#include <vector>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+  *
+  * For the requirements on the CPUMatrixT type, see the documentation of the function copy(CPUMatrixT, compressed_matrix<>)
+  *
+  * @param cpu_matrix   A sparse matrix on the host.
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT & cpu_matrix,
+          coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  vcl_size_t group_num = 64;
+
+  // Step 1: Determine nonzeros:
+  if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+  {
+    vcl_size_t num_entries = 0;
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+        ++num_entries;
+
+    // Step 2: Set up matrix data:
+    gpu_matrix.nonzeros_ = num_entries;
+    gpu_matrix.rows_ = cpu_matrix.size1();
+    gpu_matrix.cols_ = cpu_matrix.size2();
+
+    viennacl::backend::typesafe_host_array<unsigned int> group_boundaries(gpu_matrix.handle3(), group_num + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.internal_nnz());
+    std::vector<NumericT> elements(gpu_matrix.internal_nnz());
+
+    vcl_size_t data_index = 0;
+    vcl_size_t current_fraction = 0;
+
+    group_boundaries.set(0, 0);
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();  row_it != cpu_matrix.end1(); ++row_it)
+    {
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      {
+        coord_buffer.set(2*data_index, col_it.index1());
+        coord_buffer.set(2*data_index + 1, col_it.index2());
+        elements[data_index] = *col_it;
+        ++data_index;
+      }
+
+      while (data_index > vcl_size_t(static_cast<double>(current_fraction + 1) / static_cast<double>(group_num)) * num_entries)    //split data equally over 64 groups
+        group_boundaries.set(++current_fraction, data_index);
+    }
+
+    //write end of last group:
+    group_boundaries.set(group_num, data_index);
+    //group_boundaries[1] = data_index; //for one compute unit
+
+    //std::cout << "Group boundaries: " << std::endl;
+    //for (vcl_size_t i=0; i<group_boundaries.size(); ++i)
+    //  std::cout << group_boundaries[i] << std::endl;
+
+    viennacl::backend::memory_create(gpu_matrix.group_boundaries_, group_boundaries.raw_size(), traits::context(gpu_matrix.group_boundaries_), group_boundaries.get());
+    viennacl::backend::memory_create(gpu_matrix.coord_buffer_,         coord_buffer.raw_size(), traits::context(gpu_matrix.coord_buffer_),     coord_buffer.get());
+    viennacl::backend::memory_create(gpu_matrix.elements_,  sizeof(NumericT)*elements.size(), traits::context(gpu_matrix.elements_),         &(elements[0]));
+  }
+}
+
+/** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
+  *
+  * @param cpu_matrix   A sparse square matrix on the host.
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const std::vector< std::map<unsigned int, NumericT> > & cpu_matrix,
+          coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  vcl_size_t max_col = 0;
+  for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+  {
+    if (cpu_matrix[i].size() > 0)
+      max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+  }
+
+  viennacl::copy(tools::const_sparse_matrix_adapter<NumericT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
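+// ---------------------------------------------------------------------------
+// [Editor's note] Illustrative usage sketch for the host-to-device copy()
+// overloads above; this block is not part of the upstream ViennaCL sources and
+// the identifiers stl_A and vcl_A are placeholders chosen for the example.
+//
+//   #include "viennacl/coordinate_matrix.hpp"
+//
+//   std::vector< std::map<unsigned int, float> > stl_A(4);   // 4 rows
+//   stl_A[0][1] = 2.0f;                                      // A(0,1) =  2.0
+//   stl_A[3][2] = -1.5f;                                     // A(3,2) = -1.5
+//
+//   viennacl::coordinate_matrix<float> vcl_A;
+//   viennacl::copy(stl_A, vcl_A);  // column count deduced from largest index
+// ---------------------------------------------------------------------------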
+
+//gpu to cpu:
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+  *
+  * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+  * - operator(i,j)       Write new entries via the parenthesis operator
+  *
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+          CPUMatrixT & cpu_matrix )
+{
+  assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+  {
+    //get raw data from memory:
+    viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.nnz());
+    std::vector<NumericT> elements(gpu_matrix.nnz());
+
+    //std::cout << "GPU nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+    viennacl::backend::memory_read(gpu_matrix.handle12(), 0, coord_buffer.raw_size(), coord_buffer.get());
+    viennacl::backend::memory_read(gpu_matrix.handle(),   0, sizeof(NumericT) * elements.size(), &(elements[0]));
+
+    //fill the cpu_matrix:
+    for (vcl_size_t index = 0; index < gpu_matrix.nnz(); ++index)
+      cpu_matrix(coord_buffer[2*index], coord_buffer[2*index+1]) = elements[index];
+
+  }
+}
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
+  *
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+          std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+  if (cpu_matrix.size() == 0)
+    cpu_matrix.resize(gpu_matrix.size1());
+
+  assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+  tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+  copy(gpu_matrix, temp);
+}
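+// ---------------------------------------------------------------------------
+// [Editor's note] Illustrative read-back sketch, not part of the upstream
+// sources: any CPU matrix type offering resize(rows, cols) and operator(i,j)
+// works with the generic overload, while the std::vector< std::map<> >
+// overload resizes the outer vector automatically if it is empty.
+//
+//   std::vector< std::map<unsigned int, float> > stl_B;
+//   viennacl::copy(vcl_A, stl_B);   // vcl_A as in the sketch further above
+// ---------------------------------------------------------------------------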
+
+
+//////////////////////// coordinate_matrix //////////////////////////
+/** @brief A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.
+  *
+  * The present implementation of coordinate_matrix suffers from poor runtime efficiency. Users are advised to use compressed_matrix in the meantime.
+  *
+  * @tparam NumericT    The floating point type (either float or double, checked at compile time)
+  * @tparam AlignmentV     The internal memory size for the arrays, given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two.
+  */
+template<class NumericT, unsigned int AlignmentV /* see forwards.h */ >
+class coordinate_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+  typedef vcl_size_t                                                                                 size_type;
+
+  /** @brief Default construction of a coordinate matrix. No memory is allocated */
+  coordinate_matrix() : rows_(0), cols_(0), nonzeros_(0), group_num_(64) {}
+
+  explicit coordinate_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), group_num_(64)
+  {
+    group_boundaries_.switch_active_handle_id(ctx.memory_type());
+    coord_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      group_boundaries_.opencl_handle().context(ctx.opencl_context());
+      coord_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+  /** @brief Construction of a coordinate matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param nonzeros Optional number of nonzeros for memory preallocation
+      * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+      */
+  coordinate_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context()) :
+    rows_(rows), cols_(cols), nonzeros_(nonzeros)
+  {
+    if (nonzeros > 0)
+    {
+      viennacl::backend::memory_create(group_boundaries_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (group_num_ + 1), ctx);
+      viennacl::backend::memory_create(coord_buffer_,     viennacl::backend::typesafe_host_array<unsigned int>().element_size() * 2 * internal_nnz(), ctx);
+      viennacl::backend::memory_create(elements_,         sizeof(NumericT) * internal_nnz(), ctx);
+    }
+    else
+    {
+      group_boundaries_.switch_active_handle_id(ctx.memory_type());
+      coord_buffer_.switch_active_handle_id(ctx.memory_type());
+      elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+      if (ctx.memory_type() == OPENCL_MEMORY)
+      {
+        group_boundaries_.opencl_handle().context(ctx.opencl_context());
+        coord_buffer_.opencl_handle().context(ctx.opencl_context());
+        elements_.opencl_handle().context(ctx.opencl_context());
+      }
+#endif
+    }
+  }
+
+  /** @brief Construction of a coordinate matrix with the supplied number of rows and columns in the supplied context. Does not yet allocate memory.
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param ctx      Context in which to create the matrix
+      */
+  explicit coordinate_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+    : rows_(rows), cols_(cols), nonzeros_(0)
+  {
+    group_boundaries_.switch_active_handle_id(ctx.memory_type());
+    coord_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      group_boundaries_.opencl_handle().context(ctx.opencl_context());
+      coord_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+
+  /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
+  void reserve(vcl_size_t new_nonzeros)
+  {
+    if (new_nonzeros > nonzeros_)  //TODO: Do we need to initialize new memory with zero?
+    {
+      handle_type coord_buffer_old;
+      handle_type elements_old;
+      viennacl::backend::memory_shallow_copy(coord_buffer_, coord_buffer_old);
+      viennacl::backend::memory_shallow_copy(elements_, elements_old);
+
+      vcl_size_t internal_new_nnz = viennacl::tools::align_to_multiple<vcl_size_t>(new_nonzeros, AlignmentV);
+      viennacl::backend::typesafe_host_array<unsigned int> size_deducer(coord_buffer_);
+      viennacl::backend::memory_create(coord_buffer_, size_deducer.element_size() * 2 * internal_new_nnz, viennacl::traits::context(coord_buffer_));
+      viennacl::backend::memory_create(elements_,     sizeof(NumericT)  * internal_new_nnz,             viennacl::traits::context(elements_));
+
+      viennacl::backend::memory_copy(coord_buffer_old, coord_buffer_, 0, 0, size_deducer.element_size() * 2 * nonzeros_);
+      viennacl::backend::memory_copy(elements_old,     elements_,     0, 0, sizeof(NumericT)  * nonzeros_);
+
+      nonzeros_ = new_nonzeros;
+    }
+  }
+
+  /** @brief Resize the matrix.
+      *
+      * @param new_size1    New number of rows
+      * @param new_size2    New number of columns
+      * @param preserve     If true, the old values are preserved; entries outside the new size are discarded.
+      */
+  void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+  {
+    assert (new_size1 > 0 && new_size2 > 0);
+
+    if (new_size1 < rows_ || new_size2 < cols_) //shrinking: entries outside the new size must be removed
+    {
+      std::vector<std::map<unsigned int, NumericT> > stl_sparse_matrix;
+      if (rows_ > 0)
+        stl_sparse_matrix.resize(rows_);
+
+      if (preserve && rows_ > 0)
+        viennacl::copy(*this, stl_sparse_matrix);
+
+      stl_sparse_matrix.resize(new_size1);
+
+      //std::cout << "Cropping STL matrix of size " << stl_sparse_matrix.size() << std::endl;
+      if (new_size2 < cols_ && rows_ > 0)
+      {
+        for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+        {
+          std::list<unsigned int> to_delete;
+          for (typename std::map<unsigned int, NumericT>::iterator it = stl_sparse_matrix[i].begin();
+               it != stl_sparse_matrix[i].end();
+               ++it)
+          {
+            if (it->first >= new_size2)
+              to_delete.push_back(it->first);
+          }
+
+          for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+            stl_sparse_matrix[i].erase(*it);
+        }
+        //std::cout << "Cropping done..." << std::endl;
+      }
+
+      rows_ = new_size1;
+      cols_ = new_size2;
+      viennacl::copy(stl_sparse_matrix, *this);
+    }
+
+    rows_ = new_size1;
+    cols_ = new_size2;
+  }
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+  void clear()
+  {
+    viennacl::backend::typesafe_host_array<unsigned int> host_group_buffer(group_boundaries_, 65);
+    viennacl::backend::typesafe_host_array<unsigned int> host_coord_buffer(coord_buffer_, 2);
+    std::vector<NumericT> host_elements(1);
+
+    viennacl::backend::memory_create(group_boundaries_, host_group_buffer.element_size() * 65, viennacl::traits::context(group_boundaries_), host_group_buffer.get());
+    viennacl::backend::memory_create(coord_buffer_,     host_coord_buffer.element_size() * 2,   viennacl::traits::context(coord_buffer_),     host_coord_buffer.get());
+    viennacl::backend::memory_create(elements_,         sizeof(NumericT) * 1,                   viennacl::traits::context(elements_),         &(host_elements[0]));
+
+    nonzeros_ = 0;
+    group_num_ = 64;
+  }
+
+  /** @brief  Returns the number of rows */
+  vcl_size_t size1() const { return rows_; }
+  /** @brief  Returns the number of columns */
+  vcl_size_t size2() const { return cols_; }
+  /** @brief  Returns the number of nonzero entries */
+  vcl_size_t nnz() const { return nonzeros_; }
+  /** @brief  Returns the number of internal nonzero entries */
+  vcl_size_t internal_nnz() const { return viennacl::tools::align_to_multiple<vcl_size_t>(nonzeros_, AlignmentV); }
+
+  /** @brief  Returns the OpenCL handle to the (row, column) index array */
+  const handle_type & handle12() const { return coord_buffer_; }
+  /** @brief  Returns the OpenCL handle to the matrix entry array */
+  const handle_type & handle() const { return elements_; }
+  /** @brief  Returns the OpenCL handle to the group start index array */
+  const handle_type & handle3() const { return group_boundaries_; }
+
+  vcl_size_t groups() const { return group_num_; }
+
+#if defined(_MSC_VER) && _MSC_VER < 1500      //Visual Studio 2005 needs special treatment
+  template<typename CPUMatrixT>
+  friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix & gpu_matrix );
+#else
+  template<typename CPUMatrixT, typename NumericT2, unsigned int AlignmentV2>
+  friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix<NumericT2, AlignmentV2> & gpu_matrix );
+#endif
+
+private:
+  /** @brief Copy constructor is currently not available. */
+  coordinate_matrix(coordinate_matrix const &);
+
+  /** @brief Assignment is currently not available. */
+  coordinate_matrix & operator=(coordinate_matrix const &);
+
+
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t nonzeros_;
+  vcl_size_t group_num_;
+  handle_type coord_buffer_;
+  handle_type elements_;
+  handle_type group_boundaries_;
+};
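+// ---------------------------------------------------------------------------
+// [Editor's note] Construction sketch, not part of the upstream sources,
+// showing the constructors documented above; 'ctx' is a placeholder context.
+//
+//   viennacl::context ctx;                                     // default context
+//   viennacl::coordinate_matrix<float> A(1000, 1000, 5000, ctx); // preallocates
+//                                                                // room for 5000 nonzeros
+//   // entries are typically filled via viennacl::copy() as shown above
+// ---------------------------------------------------------------------------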
+
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x += A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x -= A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x += A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x -= A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
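+// ---------------------------------------------------------------------------
+// [Editor's note] Sketch of the operations enabled by the op_executor
+// specializations above, not part of the upstream sources; A, x, y are
+// placeholder names and prod() comes from viennacl/linalg/prod.hpp.
+//
+//   #include "viennacl/linalg/prod.hpp"
+//
+//   viennacl::coordinate_matrix<float> A;    // filled via viennacl::copy(...)
+//   viennacl::vector<float> x(A.size2()), y(A.size1());
+//   y  = viennacl::linalg::prod(A, x);   // dispatches to prod_impl(A, x, 1, y, 0)
+//   y += viennacl::linalg::prod(A, x);   // in-place add, beta = 1
+//   x  = viennacl::linalg::prod(A, x);   // aliasing handled via a temporary
+// ---------------------------------------------------------------------------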
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
new file mode 100644
index 0000000..c13ef01
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
@@ -0,0 +1,270 @@
+#ifndef VIENNACL_DETAIL_MATRIX_DEF_HPP_
+#define VIENNACL_DETAIL_MATRIX_DEF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/detail/matrix_def.hpp
+    @brief Forward declaration of dense matrix classes
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+/** @brief Base class for representing matrices where the individual entries are not all stored explicitly, e.g. identity_matrix<>
+  *
+  * Examples are identity_matrix, scalar_matrix, and zero_matrix.
+  */
+template<typename NumericT>
+class implicit_matrix_base
+{
+protected:
+  typedef vcl_size_t        size_type;
+  implicit_matrix_base(size_type size1, size_type size2, NumericT value, bool diag, viennacl::context ctx) : size1_(size1), size2_(size2), value_(value), diag_(diag), off_diag_(0), ctx_(ctx){ }
+public:
+  typedef NumericT const & const_reference;
+  typedef NumericT cpu_value_type;
+
+  size_type size1() const { return size1_; }
+  size_type size2() const { return size2_; }
+  viennacl::context context() const { return ctx_; }
+  NumericT  value() const { return value_; }
+  bool diag() const { return diag_; }
+
+  const_reference operator()(size_type i, size_type j) const
+  {
+    if (diag_) return (i == j) ? value_ : off_diag_;
+    return value_;
+  }
+protected:
+  size_type size1_;
+  size_type size2_;
+  NumericT value_;
+  bool diag_;
+  NumericT off_diag_;
+  viennacl::context ctx_;
+};
+
+//
+// Initializer types
+//
+/** @brief Represents an identity matrix (ones on the diagonal, zeros elsewhere). To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class identity_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef vcl_size_t         size_type;
+  typedef NumericT const & const_reference;
+
+  identity_matrix(size_type s, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s, s, 1, true, ctx){}
+};
+
+
+/** @brief Represents a matrix consisting of zeros only. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class zero_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef vcl_size_t         size_type;
+  typedef NumericT const & const_reference;
+
+  zero_matrix(size_type s1, size_type s2, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s1, s2, 0, false, ctx){}
+};
+
+
+/** @brief Represents a matrix consisting of a single scalar value 's' only, i.e. m(i,j) = s for all i and j. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class scalar_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef vcl_size_t         size_type;
+  typedef NumericT const & const_reference;
+
+  scalar_matrix(size_type s1, size_type s2, const_reference val, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s1, s2, val, false, ctx) {}
+};
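+// ---------------------------------------------------------------------------
+// [Editor's note] Initializer sketch, not part of the upstream sources: the
+// three implicit matrix types above are typically assigned to a dense matrix.
+//
+//   #include "viennacl/matrix.hpp"
+//
+//   viennacl::matrix<float> M(4, 4);
+//   M = viennacl::identity_matrix<float>(4);        // ones on the diagonal
+//   M = viennacl::zero_matrix<float>(4, 4);         // all zeros
+//   M = viennacl::scalar_matrix<float>(4, 4, 3.0f); // all entries 3.0
+// ---------------------------------------------------------------------------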
+
+template<class NumericT, typename SizeT, typename DistanceT>
+class matrix_base
+{
+  typedef matrix_base<NumericT, SizeT, DistanceT>          self_type;
+public:
+
+  typedef matrix_iterator<row_iteration, self_type >   iterator1;
+  typedef matrix_iterator<col_iteration, self_type >   iterator2;
+  typedef scalar<NumericT>                                                  value_type;
+  typedef NumericT                                                          cpu_value_type;
+  typedef SizeT                                                            size_type;
+  typedef DistanceT                                                        difference_type;
+  typedef viennacl::backend::mem_handle                                       handle_type;
+
+  /** @brief The default constructor. Does not allocate any memory. */
+  explicit matrix_base(): size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(false), row_major_(true) {}
+
+  /** @brief The layout constructor. Does not allocate any memory. */
+  explicit matrix_base(bool is_row_major) : size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(true), row_major_(is_row_major) {}
+
+  /** @brief Creates the matrix with the given dimensions
+    *
+    * @param rows     Number of rows
+    * @param columns  Number of columns
+    * @param is_row_major  Boolean flag stating whether this matrix is stored row-major
+    * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+    */
+  explicit matrix_base(size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+
+  /** @brief Constructor for creating a matrix_range or matrix_stride from some other matrix/matrix_range/matrix_stride */
+  explicit matrix_base(viennacl::backend::mem_handle & h,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major): size1_(mat_size1), size2_(mat_size2),
+        start1_(mat_start1), start2_(mat_start2),
+        stride1_(mat_stride1), stride2_(mat_stride2),
+        internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2),
+        row_major_fixed_(true), row_major_(is_row_major),
+        elements_(h) {}
+
+
+  template<typename LHS, typename RHS, typename OP>
+  explicit matrix_base(matrix_expression<const LHS, const RHS, OP> const & proxy);
+
+  // CUDA or host memory:
+  explicit matrix_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major);
+
+#ifdef VIENNACL_WITH_OPENCL
+  explicit matrix_base(cl_mem mem, size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+  explicit matrix_base(cl_mem mem, viennacl::context ctx,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major);
+#endif
+
+  /* Copy CTOR */
+  matrix_base(const self_type & other);
+
+  /* Conversion CTOR */
+  template<typename OtherNumericT>
+  matrix_base(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+  self_type & operator=(const self_type & other);
+  template<typename OtherNumericT>
+  self_type & operator=(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+  /** @brief Implementation of the operation m1 = m2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+    * @param proxy  An expression template proxy class. */
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  // A = trans(B). Currently achieved in CPU memory
+  self_type & operator=(const matrix_expression< const self_type, const self_type, op_trans> & proxy);
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator+=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator-=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  /** @brief Assigns the supplied identity matrix to the matrix. */
+  self_type & operator = (identity_matrix<NumericT> const & m);
+  /** @brief Assigns the supplied zero matrix to the matrix. */
+  self_type & operator = (zero_matrix<NumericT> const & m);
+  /** @brief Assigns the supplied scalar matrix to the matrix. */
+  self_type & operator = (scalar_matrix<NumericT> const & m);
+  //read-write access to an element of the matrix/matrix_range/matrix_slice
+  /** @brief Read-write access to a single element of the matrix/matrix_range/matrix_slice */
+  entry_proxy<NumericT> operator()(size_type row_index, size_type col_index);
+  /** @brief Read access to a single element of the matrix/matrix_range/matrix_slice */
+  const_entry_proxy<NumericT> operator()(size_type row_index, size_type col_index) const;
+  self_type & operator += (const self_type & other);
+  self_type & operator -= (const self_type & other);
+
+  /** @brief Scales the matrix by a char (8-bit integer) */
+  self_type & operator *= (char val);
+  /** @brief Scales the matrix by a short integer */
+  self_type & operator *= (short val);
+  /** @brief Scales the matrix by an integer */
+  self_type & operator *= (int val);
+  /** @brief Scales the matrix by a long integer */
+  self_type & operator *= (long val);
+  /** @brief Scales the matrix by a single precision floating point value */
+  self_type & operator *= (float val);
+  /** @brief Scales the matrix by a double precision floating point value */
+  self_type & operator *= (double val);
+
+  /** @brief Scales the matrix by a char (8-bit integer) */
+  self_type & operator /= (char val);
+  /** @brief Scales the matrix by a short integer */
+  self_type & operator /= (short val);
+  /** @brief Scales the matrix by an integer */
+  self_type & operator /= (int val);
+  /** @brief Scales the matrix by a long integer */
+  self_type & operator /= (long val);
+  /** @brief Scales the matrix by a single precision floating point value */
+  self_type & operator /= (float val);
+  /** @brief Scales the matrix by a double precision floating point value */
+  self_type & operator /= (double val);
+
+  /** @brief Sign flip for the matrix. Emulated to be equivalent to -1.0 * matrix */
+  matrix_expression<const self_type, const NumericT, op_mult> operator-() const;
+  /** @brief Returns the number of rows */
+  size_type size1() const { return size1_;}
+  /** @brief Returns the number of columns */
+  size_type size2() const { return size2_; }
+  /** @brief Returns the row offset within the underlying memory buffer */
+  size_type start1() const { return start1_;}
+  /** @brief Returns the column offset within the underlying memory buffer */
+  size_type start2() const { return start2_; }
+  /** @brief Returns the stride in the row direction within the underlying memory buffer */
+  size_type stride1() const { return stride1_;}
+  /** @brief Returns the stride in the column direction within the underlying memory buffer */
+  size_type stride2() const { return stride2_; }
+  /** @brief Resets all entries to zero */
+  void clear();
+  /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
+  size_type internal_size1() const { return internal_size1_; }
+  /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
+  size_type internal_size2() const { return internal_size2_; }
+  /** @brief Returns the total amount of allocated memory in multiples of sizeof(NumericT) */
+  size_type internal_size() const { return internal_size1() * internal_size2(); }
+  /** @brief Returns the OpenCL handle, non-const-version */
+  handle_type & handle()       { return elements_; }
+  /** @brief Returns the OpenCL handle, const-version */
+  const handle_type & handle() const { return elements_; }
+  viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id(); }
+  bool row_major() const { return row_major_; }
+  void switch_memory_context(viennacl::context new_ctx) { viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx); }
+
+protected:
+  void set_handle(viennacl::backend::mem_handle const & h);
+  void resize(size_type rows, size_type columns, bool preserve = true);
+private:
+  size_type size1_;
+  size_type size2_;
+  size_type start1_;
+  size_type start2_;
+  size_type stride1_;
+  size_type stride2_;
+  size_type internal_size1_;
+  size_type internal_size2_;
+  bool row_major_fixed_; //helper flag to make layout of matrix<T, row_major> A; persistent
+  bool row_major_;
+  handle_type elements_;
+}; //matrix
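+// ---------------------------------------------------------------------------
+// [Editor's note] Usage sketch for the element access and scaling operators
+// declared above, not part of the upstream sources; M is a placeholder.
+//
+//   viennacl::matrix<float> M(3, 3);
+//   M(0, 0) = 1.5f;   // entry_proxy-based read-write access (one transfer per element)
+//   M *= 2.0f;        // scale all entries
+//   M.clear();        // reset all entries to zero, size unchanged
+// ---------------------------------------------------------------------------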
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
new file mode 100644
index 0000000..4624b76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
@@ -0,0 +1,349 @@
+#ifndef VIENNACL_DETAIL_VECTOR_DEF_HPP_
+#define VIENNACL_DETAIL_VECTOR_DEF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/detail/vector_def.hpp
+    @brief Forward declarations of the implicit_vector_base and vector_base classes.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+
+/** @brief Common base class for representing vectors where the entries are not all stored explicitly.
+  *
+  * Typical examples are zero_vector or scalar_vector.
+  */
+template<typename NumericT>
+class implicit_vector_base
+{
+protected:
+  implicit_vector_base(vcl_size_t s, vcl_size_t i, NumericT v, viennacl::context ctx) : size_(s), index_(std::make_pair(true,i)), value_(v), ctx_(ctx){ }
+  implicit_vector_base(vcl_size_t s, NumericT v, viennacl::context ctx) : size_(s), index_(std::make_pair(false,0)), value_(v), ctx_(ctx){ }
+
+public:
+  typedef NumericT const & const_reference;
+  typedef NumericT cpu_value_type;
+
+  viennacl::context context() const { return ctx_; }
+  vcl_size_t size() const { return size_; }
+  cpu_value_type  value() const { return value_; }
+  vcl_size_t index() const { return index_.second; }
+  bool has_index() const { return index_.first; }
+
+  cpu_value_type operator()(vcl_size_t i) const
+  {
+    if (index_.first)
+      return (i==index_.second)?value_:0;
+    return value_;
+  }
+
+  cpu_value_type operator[](vcl_size_t i) const
+  {
+    if (index_.first)
+      return (i==index_.second)?value_:0;
+    return value_;
+  }
+
+protected:
+  vcl_size_t size_;
+  std::pair<bool, vcl_size_t> index_;
+  NumericT value_;
+  viennacl::context ctx_;
+};
+
+/** @brief Represents a vector consisting of 1 at a given index and zeros otherwise.*/
+template<typename NumericT>
+struct unit_vector : public implicit_vector_base<NumericT>
+{
+  unit_vector(vcl_size_t s, vcl_size_t ind, viennacl::context ctx = viennacl::context()) : implicit_vector_base<NumericT>(s, ind, 1, ctx)
+  {
+    assert( (ind < s) && bool("Provided index out of range!") );
+  }
+};
+
+
+/** @brief Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slice only. */
+template<typename NumericT>
+struct scalar_vector : public implicit_vector_base<NumericT>
+{
+  scalar_vector(vcl_size_t s, NumericT val, viennacl::context ctx = viennacl::context()) : implicit_vector_base<NumericT>(s, val, ctx) {}
+};
+
+template<typename NumericT>
+struct zero_vector : public scalar_vector<NumericT>
+{
+  zero_vector(vcl_size_t s, viennacl::context ctx = viennacl::context()) : scalar_vector<NumericT>(s, 0, ctx){}
+};
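+// ---------------------------------------------------------------------------
+// [Editor's note] Initializer sketch, not part of the upstream sources, for
+// the implicit vector types above.
+//
+//   #include "viennacl/vector.hpp"
+//
+//   viennacl::vector<float> v(10);
+//   v = viennacl::unit_vector<float>(10, 3);      // v[3] = 1, all other entries 0
+//   v = viennacl::zero_vector<float>(10);         // all zeros
+//   v = viennacl::scalar_vector<float>(10, 2.5f); // all entries 2.5
+// ---------------------------------------------------------------------------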
+
+
+/** @brief Common base class for dense vectors, vector ranges, and vector slices.
+  *
+  * @tparam NumericT   The floating point type, either 'float' or 'double'
+  */
+template<class NumericT, typename SizeT /* see forwards.h for default type */, typename DistanceT /* see forwards.h for default type */>
+class vector_base
+{
+  typedef vector_base<NumericT, SizeT, DistanceT>         self_type;
+
+public:
+  typedef scalar<NumericT>                                value_type;
+  typedef NumericT                                        cpu_value_type;
+  typedef viennacl::backend::mem_handle                     handle_type;
+  typedef SizeT                                          size_type;
+  typedef DistanceT                                      difference_type;
+  typedef const_vector_iterator<NumericT, 1>              const_iterator;
+  typedef vector_iterator<NumericT, 1>                    iterator;
+
+  /** @brief Returns the length of the vector (cf. std::vector)  */
+  size_type size() const { return size_; }
+  /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'AlignmentV' */
+  size_type internal_size() const { return internal_size_; }
+  /** @brief Returns the offset within the buffer  */
+  size_type start() const { return start_; }
+  /** @brief Returns the stride within the buffer (in multiples of sizeof(NumericT)) */
+  size_type stride() const { return stride_; }
+  /** @brief Returns true if the size is zero */
+  bool empty() const { return size_ == 0; }
+  /** @brief Returns the memory handle. */
+  const handle_type & handle() const { return elements_; }
+  /** @brief Returns the memory handle. */
+  handle_type & handle() { return elements_; }
+  viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id();  }
+
+  /** @brief Default constructor in order to be compatible with various containers.
+    */
+  explicit vector_base();
+
+  /** @brief An explicit constructor for wrapping an existing vector into a vector_range or vector_slice.
+     *
+     * @param h          The existing memory handle from a vector/vector_range/vector_slice
+     * @param vec_size   The length (i.e. size) of the buffer
+     * @param vec_start  The offset from the beginning of the buffer identified by 'h'
+     * @param vec_stride Increment between two elements in the original buffer (in multiples of NumericT)
+    */
+  explicit vector_base(viennacl::backend::mem_handle & h, size_type vec_size, size_type vec_start, size_type vec_stride);
+
+  /** @brief Creates a vector and allocates the necessary memory */
+  explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context());
+
+  // CUDA or host memory:
+  explicit vector_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start = 0, size_type stride = 1);
+
+#ifdef VIENNACL_WITH_OPENCL
+  /** @brief Create a vector from existing OpenCL memory
+    *
+    * Note: The provided memory must take a possible AlignmentV into account, i.e. existing_mem must be at least of size internal_size()!
+    * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+    *
+    * @param existing_mem   An OpenCL handle representing the memory
+    * @param vec_size       The size of the vector.
+    */
+  explicit vector_base(cl_mem existing_mem, size_type vec_size, size_type start = 0, size_type stride = 1, viennacl::context ctx = viennacl::context());
+#endif
+
+  template<typename LHS, typename RHS, typename OP>
+  explicit vector_base(vector_expression<const LHS, const RHS, OP> const & proxy);
+
+  // Copy CTOR:
+  vector_base(const self_type & other);
+
+  // Conversion CTOR:
+  template<typename OtherNumericT>
+  vector_base(const vector_base<OtherNumericT> & v1);
+
+  /** @brief Assignment operator. Other vector needs to be of the same size, or this vector is not yet initialized.
+    */
+  self_type & operator=(const self_type & vec);
+  /** @brief Implementation of the operation v1 = v2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+    * @param proxy  An expression template proxy class.
+    */
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator=(const vector_expression<const LHS, const RHS, OP> & proxy);
+  /** @brief Converts a vector of a different numeric type to the current numeric type */
+  template<typename OtherNumericT>
+  self_type &  operator = (const vector_base<OtherNumericT> & v1);
+  /** @brief Creates the vector from the supplied unit vector. */
+  self_type & operator = (unit_vector<NumericT> const & v);
+  /** @brief Creates the vector from the supplied zero vector. */
+  self_type & operator = (zero_vector<NumericT> const & v);
+  /** @brief Creates the vector from the supplied scalar vector. */
+  self_type & operator = (scalar_vector<NumericT> const & v);
+
+
+  ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
+  /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
+    * @param proxy An expression template proxy class
+    */
+  self_type & operator=(const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy);
+
+  //transposed_matrix_proxy:
+  /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
+    * @param proxy An expression template proxy class
+    */
+  self_type & operator=(const vector_expression< const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans >,
+                        const vector_base<NumericT>,
+                        op_prod> & proxy);
+
+  ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
+
+
+  //read-write access to an element of the vector
+  /** @brief Read-write access to a single element of the vector */
+  entry_proxy<NumericT> operator()(size_type index);
+  /** @brief Read-write access to a single element of the vector */
+  entry_proxy<NumericT> operator[](size_type index);
+  /** @brief Read access to a single element of the vector */
+  const_entry_proxy<NumericT> operator()(size_type index) const;
+  /** @brief Read access to a single element of the vector */
+  const_entry_proxy<NumericT> operator[](size_type index) const;
+  self_type & operator += (const self_type & vec);
+  self_type & operator -= (const self_type & vec);
+
+  /** @brief Scales a vector (or proxy) by a char (8-bit integer) */
+  self_type & operator *= (char val);
+  /** @brief Scales a vector (or proxy) by a short integer */
+  self_type & operator *= (short val);
+  /** @brief Scales a vector (or proxy) by an integer */
+  self_type & operator *= (int val);
+  /** @brief Scales a vector (or proxy) by a long integer */
+  self_type & operator *= (long val);
+  /** @brief Scales a vector (or proxy) by a single precision floating point value */
+  self_type & operator *= (float val);
+  /** @brief Scales a vector (or proxy) by a double precision floating point value */
+  self_type & operator *= (double val);
+
+
+  /** @brief Scales a vector (or proxy) by a char (8-bit integer) */
+  self_type & operator /= (char val);
+  /** @brief Scales a vector (or proxy) by a short integer */
+  self_type & operator /= (short val);
+  /** @brief Scales a vector (or proxy) by an integer */
+  self_type & operator /= (int val);
+  /** @brief Scales a vector (or proxy) by a long integer */
+  self_type & operator /= (long val);
+  /** @brief Scales a vector (or proxy) by a single precision floating point value */
+  self_type & operator /= (float val);
+  /** @brief Scales a vector (or proxy) by a double precision floating point value */
+  self_type & operator /= (double val);
+
+  /** @brief Scales the vector by a char (8-bit integer) 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (char value) const;
+  /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (short value) const;
+  /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (int value) const;
+  /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (long value) const;
+  /** @brief Scales the vector by a single precision floating point value 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (float value) const;
+  /** @brief Scales the vector by a double precision floating point value 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (double value) const;
+
+  /** @brief Scales the vector by a char (8-bit integer) 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (char value) const;
+  /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (short value) const;
+  /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (int value) const;
+  /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (long value) const;
+  /** @brief Scales the vector by a single precision floating point value 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (float value) const;
+  /** @brief Scales the vector by a double precision floating point value 'alpha' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (double value) const;
+
+  /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
+  vector_expression<const self_type, const NumericT, op_mult> operator-() const;
+  /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
+  iterator begin();
+  /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
+  iterator end();
+  /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
+  const_iterator begin() const;
+  /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
+  const_iterator end() const;
+  /** @brief Swaps the entries of the two vectors */
+  self_type & swap(self_type & other);
+
+  /** @brief Resets all entries to zero. Does not change the size of the vector. */
+  void clear();
+
+protected:
+
+  void set_handle(viennacl::backend::mem_handle const & h) {  elements_ = h; }
+
+  /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy */
+  self_type & fast_swap(self_type & other);
+
+  /** @brief Pads vectors with alignment > 1 with trailing zeros if the internal size is larger than the visible size */
+  void pad();
+
+  void switch_memory_context(viennacl::context new_ctx);
+
+  //TODO: Think about implementing the following public member functions
+  //void insert_element(unsigned int i, NumericT val){}
+  //void erase_element(unsigned int i){}
+
+  //enlarge or reduce allocated memory and set unused memory to zero
+  /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'AlignmentV'
+    *
+    *  @param new_size  The new size of the vector
+    *  @param preserve  If true, old entries of the vector are preserved; otherwise they may be discarded.
+    */
+  void resize(size_type new_size, bool preserve = true);
+
+  /** @brief Resizes the allocated memory for the vector. Convenience function for setting an OpenCL context in case reallocation is needed
+    *
+    *  @param new_size  The new size of the vector
+    *  @param ctx       The context within which the new memory should be allocated
+    *  @param preserve  If true, old entries of the vector are preserved; otherwise they may be discarded.
+    */
+  void resize(size_type new_size, viennacl::context ctx, bool preserve = true);
+private:
+
+  void resize_impl(size_type new_size, viennacl::context ctx, bool preserve = true);
+
+  size_type       size_;
+  size_type       start_;
+  size_type       stride_;
+  size_type       internal_size_;
+  handle_type elements_;
+}; //vector_base
+
+/** \endcond */
+
+} // namespace viennacl
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
new file mode 100644
index 0000000..3b6ec76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
@@ -0,0 +1,219 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/device_specific/builtin_database/common.hpp
+*
+* Common routines such as device lookup for the built-in device database.
+*/
+
+#include "viennacl/ocl/device_utils.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/forwards.h"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace builtin_database
+{
+
+using scheduler::FLOAT_TYPE;
+using scheduler::DOUBLE_TYPE;
+using namespace viennacl::ocl;
+
+template<class ParamT>
+class database_type
+{
+public:
+
+  //Because it would be too easy to use nested maps directly.
+  //THANKS, VISUAL STUDIO.
+  struct expression_t{ typedef std::map<scheduler::statement_node_numeric_type, ParamT> map_t; map_t d; };
+  struct device_name_t{ typedef std::map<device_name_type, expression_t> map_t; map_t d; };
+  struct device_architecture_t{ typedef std::map<ocl::device_architecture_family, device_name_t> map_t; map_t d; };
+  struct device_type_t{ typedef std::map<device_type, device_architecture_t> map_t; map_t d; };
+  struct type{ typedef std::map<vendor_id_type, device_type_t> map_t; map_t d; };
+  type map;
+
+  database_type<ParamT> & operator()(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4, ParamT const & p5)
+  {
+    map.d[p0].d[p1].d[p2].d[p3].d.insert(std::make_pair(p4, p5));
+    return *this;
+  }
+
+  database_type<ParamT> & add_1B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    return (*this)(p0, p1, p2, p3, scheduler::CHAR_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::UCHAR_TYPE, p5);
+  }
+
+  database_type<ParamT> & add_2B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    return (*this)(p0, p1, p2, p3, scheduler::SHORT_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::USHORT_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::HALF_TYPE, p5);
+  }
+
+  database_type<ParamT> & add_4B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    return (*this)(p0, p1, p2, p3, scheduler::INT_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::UINT_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::FLOAT_TYPE, p5);
+  }
+
+  database_type<ParamT> & add_8B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    return (*this)(p0, p1, p2, p3, scheduler::LONG_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::ULONG_TYPE, p5)
+        (p0, p1, p2, p3, scheduler::DOUBLE_TYPE, p5);
+  }
+
+  ParamT const & at(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4) const
+  {
+    return viennacl::device_specific::at(
+           viennacl::device_specific::at(
+           viennacl::device_specific::at(
+           viennacl::device_specific::at(
+             viennacl::device_specific::at(map.d, p0).d,
+           p1).d,
+           p2).d,
+         p3).d,
+         p4);
+  }
+
+
+};
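+// ---------------------------------------------------------------------------
+// [Editor's note] Registration/lookup sketch, not part of the upstream
+// sources: add_4B() inserts one parameter set under INT, UINT and FLOAT for
+// the given (vendor, device type, architecture, device name) key, and at()
+// walks the nested maps back down. 'int' stands in for a real parameter type.
+//
+//   database_type<int> db;
+//   db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", 42);
+//   int p = db.at(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "",
+//                 scheduler::FLOAT_TYPE);        // p == 42
+// ---------------------------------------------------------------------------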
+
+
+template<typename StringT>
+StringT get_mapped_device_name(StringT const & device_name, vendor_id_type vendor_id)
+{
+  if (vendor_id == viennacl::ocl::nvidia_id)
+  {
+    vcl_size_t found=0;
+    if ((found = device_name.find("GeForce",0)) != std::string::npos)
+    {
+      if ((found = device_name.find_first_of("123456789", found)) != std::string::npos)
+      {
+        switch (device_name[found]) // GeForce 400 series mapped to GTX 470, GeForce 500 series mapped to GTX 570:
+        {
+        case '4' : return "GeForce GTX 470";
+        case '5' : return "GeForce GTX 570";
+        default: break; // since there is only one Kepler and one Maxwell device in the database, fallback works properly
+        }
+      }
+    }
+    else if ((found = device_name.find("Tesla",0)) != std::string::npos) // map Fermi-based Teslas to C2050, Kepler-based Teslas to K20m
+    {
+      if (device_name.find("Tesla C10",0) != std::string::npos)
+        return "Tesla C2050";
+      else if (device_name.find("Tesla S10",0) != std::string::npos)
+        return "Tesla C2050";
+      else if (device_name.find("Tesla M20",0) != std::string::npos)
+        return "Tesla C2050";
+      else if (device_name.find("Tesla S20",0) != std::string::npos)
+        return "Tesla C2050";
+      else if (device_name.find("Tesla K",0) != std::string::npos) // all Kepler-based Teslas
+        return "Tesla K20m";
+    }
+  }
+
+  return device_name;
+}
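+// ---------------------------------------------------------------------------
+// [Editor's note] Mapping sketch, not part of the upstream sources, based on
+// the branches above:
+//
+//   get_mapped_device_name(std::string("GeForce GTX 480"), nvidia_id); // -> "GeForce GTX 470"
+//   get_mapped_device_name(std::string("GeForce GTX 560"), nvidia_id); // -> "GeForce GTX 570"
+//   get_mapped_device_name(std::string("Tesla K40m"),      nvidia_id); // -> "Tesla K20m"
+//   get_mapped_device_name(std::string("Tesla M2090"),     nvidia_id); // -> "Tesla C2050"
+// ---------------------------------------------------------------------------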
+
+/** @brief Gets the kernel parameter profile for the given device from the database
+*
+* There are built-in defaults for CPUs, accelerators, and GPUs.
+*/
+template<class NumericT, class ParamT>
+inline ParamT const & get_parameters(database_type<ParamT> const & database, viennacl::ocl::device const & device)
+{
+  scheduler::statement_node_numeric_type numeric_type = scheduler::statement_node_numeric_type(scheduler::result_of::numeric_type_id<NumericT>::value);
+
+  device_type dev_type = device.type() & device_type(0xFE); // chop off 'default' characterization
+  vendor_id_type vendor_id = device.vendor_id();
+  ocl::device_architecture_family device_architecture = device.architecture_family();
+  std::string const & device_name = device.name();
+
+
+  /*-Vendor ID-*/
+  //  std::cout << "Looking up vendor ID..." << std::endl;
+  typename database_type<ParamT>::type::map_t::const_iterator vendor_it = database.map.d.find(vendor_id);
+  //Vendor not recognized =>  device type default
+  if (vendor_it==database.map.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  /*-Device Type-*/
+  //  std::cout << "Looking up device type..." << std::endl;
+  typename database_type<ParamT>::device_type_t::map_t::const_iterator device_type_it = vendor_it->second.d.find(dev_type);
+  //Device type not recognized for this vendor => device type default
+  if (device_type_it==vendor_it->second.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  /*-Device Architecture-*/
+  //  std::cout << "Looking up device architecture..." << std::endl;
+  typename database_type<ParamT>::device_architecture_t::map_t::const_iterator architecture_it = device_type_it->second.d.find(device_architecture);
+  //Architecture not found. We try to find the closest architecture available.
+  if (architecture_it==device_type_it->second.d.end())
+  {
+    typename database_type<ParamT>::device_architecture_t::map_t::const_iterator current_it = device_type_it->second.d.begin();
+    architecture_it = current_it;
+    int closest_arch = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+    while (current_it!=device_type_it->second.d.end())
+    {
+      int arch_diff = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+      if (arch_diff < closest_arch)
+      {
+        architecture_it = current_it;
+        closest_arch = arch_diff;
+      }
+      current_it++;
+    }
+  }
+
+  /*-Device Name-*/
+  std::string mapped_device_name = get_mapped_device_name(device_name, device.vendor_id());
+
+  typename database_type<ParamT>::device_name_t::map_t::const_iterator device_name_it = architecture_it->second.d.find(mapped_device_name);
+  //Name not found. We just take the first device for the architecture
+  if (device_name_it==architecture_it->second.d.end())
+  {
+    device_name_it = architecture_it->second.d.begin();
+  }
+
+  //  std::cout << "Looking up expression name.." << std::endl;
+  /*-Expression-*/
+  typename database_type<ParamT>::expression_t::map_t::const_iterator expression_it = device_name_it->second.d.find(numeric_type);
+  //Expression not found => Vendor default
+  if (expression_it==device_name_it->second.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  //  std::cout << "Device found in the database! Getting profile..." << std::endl;
+  //Everything okay. Return specific profile//
+  return expression_it->second;
+}
+
+
+}
+}
+}
+#endif
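
A short illustration of the lookup strategy used by get_parameters() above: the
database is walked level by level (vendor id, device type, architecture, device
name, numeric type), and whenever a key is missing the lookup falls back to a
generic entry (or, for the architecture, to the closest available one). The
sketch below mimics that idea with plain nested std::map levels and an
empty-string fallback key; the names profile and lookup_or_default are
illustrative only and are not part of ViennaCL.

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for a kernel profile; the real database stores
// template parameter sets such as matrix_product_template::parameters_type.
struct profile
{
  std::string label;
  profile(std::string const & l = "") : label(l) {}
};

// One level of the lookup hierarchy: find the key, otherwise fall back to a
// designated default key (the empty string here, mirroring the "" entries
// registered for unknown devices above).
template <typename KeyT, typename ValueT>
ValueT const & lookup_or_default(std::map<KeyT, ValueT> const & level,
                                 KeyT const & key, KeyT const & fallback)
{
  typename std::map<KeyT, ValueT>::const_iterator it = level.find(key);
  if (it == level.end())
    it = level.find(fallback);     // assumed to exist for every level
  return it->second;
}

int main()
{
  // vendor -> device name -> profile, with "" acting as the fallback entry
  std::map<std::string, std::map<std::string, profile> > db;
  db[""][""]                 = profile("generic default");
  db["nvidia"][""]           = profile("nvidia default");
  db["nvidia"]["Tesla K20m"] = profile("tuned K20m profile");

  std::map<std::string, profile> const & vendor_level =
      lookup_or_default(db, std::string("nvidia"), std::string(""));
  profile const & p =
      lookup_or_default(vendor_level, std::string("Tesla K40"), std::string(""));

  std::cout << p.label << std::endl;   // prints "nvidia default"
  return 0;
}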

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
new file mode 100644
index 0000000..5eede89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace accelerator{
+namespace fallback{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif
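
The fallback files above register one matrix-product profile per combination of
transposition flags, selected at compile time through char_to_type<'N'> /
char_to_type<'T'> tag arguments; add_4B and add_8B appear to register the
profiles for 4-byte and 8-byte scalar types (float vs. double), an inference
from the names rather than something stated in the file. The sketch below shows
the tag-dispatch pattern in isolation; char_to_type and register_profile here
are local stand-ins, not the ViennaCL definitions.

#include <iostream>

// A distinct empty type per character, usable purely for overload selection.
template <char C>
struct char_to_type {};

// One overload per (transA, transB) combination, mirroring the add_4B
// overloads registered for the fallback devices above.
inline void register_profile(char_to_type<'N'>, char_to_type<'N'>) { std::cout << "profile for A   * B\n"; }
inline void register_profile(char_to_type<'T'>, char_to_type<'N'>) { std::cout << "profile for A^T * B\n"; }
inline void register_profile(char_to_type<'N'>, char_to_type<'T'>) { std::cout << "profile for A   * B^T\n"; }
inline void register_profile(char_to_type<'T'>, char_to_type<'T'>) { std::cout << "profile for A^T * B^T\n"; }

int main()
{
  // The transposition flags are encoded in the types, so each call resolves
  // to a different overload at compile time, without any runtime branching.
  register_profile(char_to_type<'N'>(), char_to_type<'N'>());
  register_profile(char_to_type<'T'>(), char_to_type<'T'>());
  return 0;
}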

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
new file mode 100644
index 0000000..ffaa9db
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace cpu{
+namespace fallback{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
new file mode 100644
index 0000000..b0e3a1c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cedar{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
new file mode 100644
index 0000000..d1179b8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
@@ -0,0 +1,65 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CYPRESS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CYPRESS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cypress{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(4,32,4,8,4,1,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
new file mode 100644
index 0000000..2805a5c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace barts{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,2,128,2,2,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,4,32));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,1,64,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,8,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
new file mode 100644
index 0000000..018839e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace devastator{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,8,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(2,64,16,4,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,1,2,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
new file mode 100644
index 0000000..9e1db25
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace scrapper{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,32,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,8,2,2,1,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(2,32,2,4,2,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
new file mode 100644
index 0000000..fb89742
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
@@ -0,0 +1,738 @@
+#ifndef VIENNACL_GMRES_HPP_
+#define VIENNACL_GMRES_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/gmres.hpp
+    @brief Implementations of the generalized minimum residual method are in this file.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/linalg/iterative_operations.hpp"
+#include "viennacl/vector_proxy.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the solver GMRES. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class gmres_tag       //generalized minimum residual
+{
+public:
+  /** @brief The constructor
+  *
+  * @param tol            Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+  * @param max_iterations The maximum number of iterations (including restarts)
+  * @param krylov_dim     The maximum dimension of the Krylov space before restart (number of restarts is found by max_iterations / krylov_dim)
+  */
+  gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned int krylov_dim = 20)
+   : tol_(tol), abs_tol_(0), iterations_(max_iterations), krylov_dim_(krylov_dim), iters_taken_(0), last_error_(0) {}
+
+  /** @brief Returns the relative tolerance */
+  double tolerance() const { return tol_; }
+
+  /** @brief Returns the absolute tolerance */
+  double abs_tolerance() const { return abs_tol_; }
+  /** @brief Sets the absolute tolerance */
+  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+  /** @brief Returns the maximum number of iterations */
+  unsigned int max_iterations() const { return iterations_; }
+  /** @brief Returns the maximum dimension of the Krylov space before restart */
+  unsigned int krylov_dim() const { return krylov_dim_; }
+  /** @brief Returns the maximum number of GMRES restarts */
+  unsigned int max_restarts() const
+  {
+    unsigned int ret = iterations_ / krylov_dim_;
+    if (ret > 0 && (ret * krylov_dim_ == iterations_) )
+      return ret - 1;
+    return ret;
+  }
+
+  /** @brief Returns the number of solver iterations */
+  unsigned int iters() const { return iters_taken_; }
+  /** @brief Set the number of solver iterations (should only be modified by the solver) */
+  void iters(unsigned int i) const { iters_taken_ = i; }
+
+  /** @brief Returns the estimated relative error at the end of the solver run */
+  double error() const { return last_error_; }
+  /** @brief Sets the estimated relative error at the end of the solver run */
+  void error(double e) const { last_error_ = e; }
+
+private:
+  double tol_;
+  double abs_tol_;
+  unsigned int iterations_;
+  unsigned int krylov_dim_;
+
+  //return values from solver
+  mutable unsigned int iters_taken_;
+  mutable double last_error_;
+};
+
+namespace detail
+{
+
+  template<typename SrcVectorT, typename DestVectorT>
+  void gmres_copy_helper(SrcVectorT const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0)
+  {
+    for (vcl_size_t i=0; i<len; ++i)
+      dest[start+i] = src[start+i];
+  }
+
+  template<typename NumericT, typename DestVectorT>
+  void gmres_copy_helper(viennacl::vector<NumericT> const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0)
+  {
+    typedef typename viennacl::vector<NumericT>::difference_type   difference_type;
+    viennacl::copy( src.begin() + static_cast<difference_type>(start),
+                    src.begin() + static_cast<difference_type>(start + len),
+                   dest.begin() + static_cast<difference_type>(start));
+  }
+
+  /** @brief Computes the Householder vector 'hh_vec' which reflects 'input_vec' such that all entries below the j-th entry of 'input_vec' become zero.
+    *
+    * @param input_vec       The input vector
+    * @param hh_vec          The Householder vector defining the reflection (I - beta * hh_vec * hh_vec^T)
+    * @param beta            The coefficient beta in (I - beta * hh_vec * hh_vec^T)
+    * @param mu              The norm of the input vector part relevant for the reflection: norm_2(input_vec[j:size])
+    * @param j               Index of the last nonzero entry in 'input_vec' after applying the reflection
+  */
+  template<typename VectorT, typename NumericT>
+  void gmres_setup_householder_vector(VectorT const & input_vec, VectorT & hh_vec, NumericT & beta, NumericT & mu, vcl_size_t j)
+  {
+    NumericT input_j = input_vec(j);
+
+    // copy entries from input vector to householder vector:
+    detail::gmres_copy_helper(input_vec, hh_vec, viennacl::traits::size(hh_vec) - (j+1), j+1);
+
+    NumericT sigma = viennacl::linalg::norm_2(hh_vec);
+    sigma *= sigma;
+
+    if (sigma <= 0)
+    {
+      beta = 0;
+      mu = input_j;
+    }
+    else
+    {
+      mu = std::sqrt(sigma + input_j*input_j);
+
+      NumericT hh_vec_0 = (input_j <= 0) ? (input_j - mu) : (-sigma / (input_j + mu));
+
+      beta = NumericT(2) * hh_vec_0 * hh_vec_0 / (sigma + hh_vec_0 * hh_vec_0);
+
+      //divide hh_vec by its diagonal element hh_vec_0
+      hh_vec /= hh_vec_0;
+      hh_vec[j] = NumericT(1);
+    }
+  }
+
+  // Apply (I - beta h h^T) to x (Householder reflection with Householder vector h)
+  template<typename VectorT, typename NumericT>
+  void gmres_householder_reflect(VectorT & x, VectorT const & h, NumericT beta)
+  {
+    NumericT hT_in_x = viennacl::linalg::inner_prod(h, x);
+    x -= (beta * hT_in_x) * h;
+  }
+
+
+  /** @brief Implementation of a pipelined GMRES solver without preconditioner
+  *
+  * Follows Algorithm 2.1 proposed by Walker in "A Simpler GMRES", but uses classical Gram-Schmidt instead of modified Gram-Schmidt for better parallelization.
+  * Uses pipelining techniques to minimize host-device transfers.
+  *
+  * @param A            The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param monitor      A callback routine which is called at each GMRES restart
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  * @return The result vector
+  */
+  template <typename MatrixType, typename ScalarType>
+  viennacl::vector<ScalarType> pipelined_solve(MatrixType const & A,
+                                               viennacl::vector<ScalarType> const & rhs,
+                                               gmres_tag const & tag,
+                                               viennacl::linalg::no_precond,
+                                               bool (*monitor)(viennacl::vector<ScalarType> const &, ScalarType, void*) = NULL,
+                                               void *monitor_data = NULL)
+  {
+    viennacl::vector<ScalarType> residual(rhs);
+    viennacl::vector<ScalarType> result = viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));
+
+    viennacl::vector<ScalarType> device_krylov_basis(rhs.internal_size() * tag.krylov_dim(), viennacl::traits::context(rhs)); // not using viennacl::matrix here because of spurious padding in column number
+    viennacl::vector<ScalarType> device_buffer_R(tag.krylov_dim()*tag.krylov_dim(), viennacl::traits::context(rhs));
+    std::vector<ScalarType>      host_buffer_R(device_buffer_R.size());
+
+    vcl_size_t buffer_size_per_vector = 128;
+    vcl_size_t num_buffer_chunks      = 3;
+    viennacl::vector<ScalarType> device_inner_prod_buffer = viennacl::zero_vector<ScalarType>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
+    viennacl::vector<ScalarType> device_r_dot_vk_buffer   = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds result of first reduction stage for <r, v_k> on device
+    viennacl::vector<ScalarType> device_vi_in_vk_buffer   = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds <v_i, v_k> for i=0..k-1 on device
+    viennacl::vector<ScalarType> device_values_xi_k       = viennacl::zero_vector<ScalarType>(tag.krylov_dim(), viennacl::traits::context(rhs)); // holds values \xi_k = <r, v_k> on device
+    std::vector<ScalarType>      host_r_dot_vk_buffer(device_r_dot_vk_buffer.size());
+    std::vector<ScalarType>      host_values_xi_k(tag.krylov_dim());
+    std::vector<ScalarType>      host_values_eta_k_buffer(tag.krylov_dim());
+    std::vector<ScalarType>      host_update_coefficients(tag.krylov_dim());
+
+    ScalarType norm_rhs = viennacl::linalg::norm_2(residual);
+    ScalarType rho_0 = norm_rhs;
+    ScalarType rho = ScalarType(1);
+
+    tag.iters(0);
+
+    for (unsigned int restart_count = 0; restart_count <= tag.max_restarts(); ++restart_count)
+    {
+      //
+      // prepare restart:
+      //
+      if (restart_count > 0)
+      {
+        // compute new residual without introducing a temporary for A*x:
+        residual = viennacl::linalg::prod(A, result);
+        residual = rhs - residual;
+
+        rho_0 = viennacl::linalg::norm_2(residual);
+      }
+
+      if (rho_0 <= ScalarType(tag.abs_tolerance()))  // trivial right hand side?
+        break;
+
+      residual /= rho_0;
+      rho = ScalarType(1);
+
+      // check for convergence:
+      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance())
+        break;
+
+      //
+      // minimize in Krylov basis:
+      //
+      vcl_size_t k = 0;
+      for (k = 0; k < static_cast<vcl_size_t>(tag.krylov_dim()); ++k)
+      {
+        if (k == 0)
+        {
+          // compute v0 = A*r and perform first reduction stage for ||v0||
+          viennacl::vector_range<viennacl::vector<ScalarType> > v0(device_krylov_basis, viennacl::range(0, rhs.size()));
+          viennacl::linalg::pipelined_gmres_prod(A, residual, v0, device_inner_prod_buffer);
+
+          // Normalize v_0 and compute first reduction stage for <r, v_0> in device_r_dot_vk_buffer:
+          viennacl::linalg::pipelined_gmres_normalize_vk(v0, residual,
+                                                         device_buffer_R, k*tag.krylov_dim() + k,
+                                                         device_inner_prod_buffer, device_r_dot_vk_buffer,
+                                                         buffer_size_per_vector, k*buffer_size_per_vector);
+        }
+        else
+        {
+          // compute v_k = A * v_{k-1} and perform first reduction stage for ||v_k||
+          viennacl::vector_range<viennacl::vector<ScalarType> > vk        (device_krylov_basis, viennacl::range( k   *rhs.internal_size(),  k   *rhs.internal_size() + rhs.size()));
+          viennacl::vector_range<viennacl::vector<ScalarType> > vk_minus_1(device_krylov_basis, viennacl::range((k-1)*rhs.internal_size(), (k-1)*rhs.internal_size() + rhs.size()));
+          viennacl::linalg::pipelined_gmres_prod(A, vk_minus_1, vk, device_inner_prod_buffer);
+
+          //
+          // Gram-Schmidt, stage 1: compute first reduction stage of <v_i, v_k>
+          //
+          viennacl::linalg::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, rhs.size(), rhs.internal_size(), k, device_vi_in_vk_buffer, buffer_size_per_vector);
+
+          //
+          // Gram-Schmidt, stage 2: compute second reduction stage of <v_i, v_k> and use that to compute v_k -= sum_i <v_i, v_k> v_i.
+          //                        Store <v_i, v_k> in R-matrix and compute first reduction stage for ||v_k||
+          //
+          viennacl::linalg::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, rhs.size(), rhs.internal_size(), k,
+                                                                device_vi_in_vk_buffer,
+                                                                device_buffer_R, tag.krylov_dim(),
+                                                                device_inner_prod_buffer, buffer_size_per_vector);
+
+          //
+          // Normalize v_k and compute first reduction stage for <r, v_k> in device_r_dot_vk_buffer:
+          //
+          viennacl::linalg::pipelined_gmres_normalize_vk(vk, residual,
+                                                         device_buffer_R, k*tag.krylov_dim() + k,
+                                                         device_inner_prod_buffer, device_r_dot_vk_buffer,
+                                                         buffer_size_per_vector, k*buffer_size_per_vector);
+        }
+      }
+
+      //
+      // Run reduction to obtain the values \xi_k = <r, v_k>.
+      // Note that unlike Algorithm 2.1 in Walker: "A Simpler GMRES", we do not update the residual
+      //
+      viennacl::fast_copy(device_r_dot_vk_buffer.begin(), device_r_dot_vk_buffer.end(), host_r_dot_vk_buffer.begin());
+      for (std::size_t i=0; i<k; ++i)
+      {
+        host_values_xi_k[i] = ScalarType(0);
+        for (std::size_t j=0; j<buffer_size_per_vector; ++j)
+          host_values_xi_k[i] += host_r_dot_vk_buffer[i*buffer_size_per_vector + j];
+      }
+
+      //
+      // Bring values in R  back to host:
+      //
+      viennacl::fast_copy(device_buffer_R.begin(), device_buffer_R.end(), host_buffer_R.begin());
+
+      //
+      // Check for premature convergence: If the diagonal element drops too far below the first norm, we're done and restrict the Krylov size accordingly.
+      //
+      vcl_size_t full_krylov_dim = k; //needed for proper access to R
+      for (std::size_t i=0; i<k; ++i)
+      {
+        if (std::fabs(host_buffer_R[i + i*k]) < tag.tolerance() * host_buffer_R[0])
+        {
+          k = i;
+          break;
+        }
+      }
+
+
+      // Compute error estimator:
+      for (std::size_t i=0; i<k; ++i)
+      {
+        tag.iters( tag.iters() + 1 ); //increase iteration counter
+
+        // check for accumulation of round-off errors for poorly conditioned systems
+        if (host_values_xi_k[i] >= rho || host_values_xi_k[i] <= -rho)
+        {
+          k = i;
+          break;  // restrict Krylov space at this point. No gain from using additional basis vectors, since orthogonality is lost.
+        }
+
+        // update error estimator
+        rho *= std::sin( std::acos(host_values_xi_k[i] / rho) );
+      }
+
+      //
+      // Solve minimization problem:
+      //
+      host_values_eta_k_buffer = host_values_xi_k;
+
+      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
+      {
+        vcl_size_t i = static_cast<vcl_size_t>(i2);
+        for (vcl_size_t j=static_cast<vcl_size_t>(i)+1; j<k; ++j)
+          host_values_eta_k_buffer[i] -= host_buffer_R[i + j*full_krylov_dim] * host_values_eta_k_buffer[j];
+
+        host_values_eta_k_buffer[i] /= host_buffer_R[i + i*full_krylov_dim];
+      }
+
+      //
+      // Update x += rho * z with z = \eta_0 * residual + sum_{i=0}^{k-1} \eta_{i+1} v_i
+      // Note that we have not updated the residual yet, hence this slightly modified as compared to the form given in Algorithm 2.1 in Walker: "A Simpler GMRES"
+      //
+      for (vcl_size_t i=0; i<k; ++i)
+        host_update_coefficients[i] = rho_0 * host_values_eta_k_buffer[i];
+
+      viennacl::fast_copy(host_update_coefficients.begin(), host_update_coefficients.end(), device_values_xi_k.begin()); //reuse device_values_xi_k_buffer here for simplicity
+
+      viennacl::linalg::pipelined_gmres_update_result(result, residual,
+                                                      device_krylov_basis, rhs.size(), rhs.internal_size(),
+                                                      device_values_xi_k, k);
+
+      tag.error( std::fabs(rho*rho_0 / norm_rhs) );
+
+      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
+        break;
+    }
+
+    return result;
+  }
+
+  /** @brief Overload for the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        gmres_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        gmres_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        gmres_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+
+  /** @brief Overload for the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        gmres_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Overload for the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+  template<typename NumericT>
+  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+                                        viennacl::vector<NumericT> const & rhs,
+                                        gmres_tag const & tag,
+                                        viennacl::linalg::no_precond,
+                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+                                        void *monitor_data = NULL)
+  {
+    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+  }
+
+
+  /** @brief Implementation of the GMRES solver.
+  *
+  * Following the algorithm proposed by Walker in "A Simpler GMRES"
+  *
+  * @param matrix       The system matrix
+  * @param rhs          The load vector
+  * @param tag          Solver configuration tag
+  * @param precond      A preconditioner. Precondition operation is done via member function apply()
+  * @param monitor      A callback routine which is called at each GMRES restart
+  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+  *
+  * @return The result vector
+  */
+  template<typename MatrixT, typename VectorT, typename PreconditionerT>
+  VectorT solve_impl(MatrixT const & matrix,
+                     VectorT const & rhs,
+                     gmres_tag const & tag,
+                     PreconditionerT const & precond,
+                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+                     void *monitor_data = NULL)
+  {
+    typedef typename viennacl::result_of::value_type<VectorT>::type            NumericType;
+    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    CPU_NumericType;
+
+    unsigned int problem_size = static_cast<unsigned int>(viennacl::traits::size(rhs));
+    VectorT result = rhs;
+    viennacl::traits::clear(result);
+
+    vcl_size_t krylov_dim = static_cast<vcl_size_t>(tag.krylov_dim());
+    if (problem_size < krylov_dim)
+      krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)
+
+    VectorT res = rhs;
+    VectorT v_k_tilde = rhs;
+    VectorT v_k_tilde_temp = rhs;
+
+    std::vector< std::vector<CPU_NumericType> > R(krylov_dim, std::vector<CPU_NumericType>(tag.krylov_dim()));
+    std::vector<CPU_NumericType> projection_rhs(krylov_dim);
+
+    std::vector<VectorT>          householder_reflectors(krylov_dim, rhs);
+    std::vector<CPU_NumericType>  betas(krylov_dim);
+
+    CPU_NumericType norm_rhs = viennacl::linalg::norm_2(rhs);
+
+    if (norm_rhs <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+      return result;
+
+    tag.iters(0);
+
+    for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
+    {
+      //
+      // (Re-)Initialize residual: r = b - A*x (without temporary for the result of A*x)
+      //
+      res = viennacl::linalg::prod(matrix, result);  //initial guess zero
+      res = rhs - res;
+      precond.apply(res);
+
+      CPU_NumericType rho_0 = viennacl::linalg::norm_2(res);
+
+      //
+      // Check for premature convergence
+      //
+      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance()) // norm_rhs is known to be nonzero here
+      {
+        tag.error(rho_0 / norm_rhs);
+        return result;
+      }
+
+      //
+      // Normalize residual and set 'rho' to 1 as requested in 'A Simpler GMRES' by Walker and Zhou.
+      //
+      res /= rho_0;
+      CPU_NumericType rho = static_cast<CPU_NumericType>(1.0);
+
+
+      //
+      // Iterate until the maximal Krylov space dimension is reached:
+      //
+      vcl_size_t k = 0;
+      for (k = 0; k < krylov_dim; ++k)
+      {
+        tag.iters( tag.iters() + 1 ); //increase iteration counter
+
+        // prepare storage:
+        viennacl::traits::clear(R[k]);
+        viennacl::traits::clear(householder_reflectors[k]);
+
+        //compute v_k = A * v_{k-1} via Householder matrices
+        if (k == 0)
+        {
+          v_k_tilde = viennacl::linalg::prod(matrix, res);
+          precond.apply(v_k_tilde);
+        }
+        else
+        {
+          viennacl::traits::clear(v_k_tilde);
+          v_k_tilde[k-1] = CPU_NumericType(1);
+
+          //Householder rotations, part 1: Compute P_1 * P_2 * ... * P_{k-1} * e_{k-1}
+          for (int i = static_cast<int>(k)-1; i > -1; --i)
+            detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
+
+          v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
+          precond.apply(v_k_tilde_temp);
+          v_k_tilde = v_k_tilde_temp;
+
+          //Householder rotations, part 2: Compute P_{k-1} * ... * P_{1} * v_k_tilde
+          for (vcl_size_t i = 0; i < k; ++i)
+            detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[i], betas[i]);
+        }
+
+        //
+        // Compute Householder reflection for v_k_tilde such that all entries below k-th entry are zero:
+        //
+        CPU_NumericType rho_k_k = 0;
+        detail::gmres_setup_householder_vector(v_k_tilde, householder_reflectors[k], betas[k], rho_k_k, k);
+
+        //
+        // copy first k entries from v_k_tilde to R[k] in order to fill k-th column with result of
+        // P_k * v_k_tilde = (v[0], ... , v[k-1], norm(v), 0, 0, ...) =: (rho_{1,k}, rho_{2,k}, ..., rho_{k,k}, 0, ..., 0);
+        //
+        detail::gmres_copy_helper(v_k_tilde, R[k], k);
+        R[k][k] = rho_k_k;
+
+        //
+        // Update residual: r = P_k r
+        // Set zeta_k = r[k] including machine precision considerations: mathematically we have |r[k]| <= rho
+        // Set rho *= sin(acos(r[k] / rho))
+        //
+        detail::gmres_householder_reflect(res, householder_reflectors[k], betas[k]);
+
+        if (res[k] > rho) //machine precision reached
+          res[k] = rho;
+        if (res[k] < -rho) //machine precision reached
+          res[k] = -rho;
+        projection_rhs[k] = res[k];
+
+        rho *= std::sin( std::acos(projection_rhs[k] / rho) );
+
+        if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())  // Residual is sufficiently reduced, stop here
+        {
+          tag.error( std::fabs(rho*rho_0 / norm_rhs) );
+          ++k;
+          break;
+        }
+      } // for k
+
+      //
+      // Triangular solver stage:
+      //
+
+      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
+      {
+        vcl_size_t i = static_cast<vcl_size_t>(i2);
+        for (vcl_size_t j=i+1; j<k; ++j)
+          projection_rhs[i] -= R[j][i] * projection_rhs[j];     //R is transposed
+
+        projection_rhs[i] /= R[i][i];
+      }
+
+      //
+      // Note: 'projection_rhs' now holds the solution (eta_1, ..., eta_k)
+      //
+
+      res *= projection_rhs[0];
+
+      if (k > 0)
+      {
+        for (unsigned int i = 0; i < k-1; ++i)
+          res[i] += projection_rhs[i+1];
+      }
+
+      //
+      // Form z inplace in 'res' by applying P_1 * ... * P_{k}
+      //
+      for (int i=static_cast<int>(k)-1; i>=0; --i)
+        detail::gmres_householder_reflect(res, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
+
+      res *= rho_0;
+      result += res;  // x += rho_0 * z    in the paper
+
+      //
+      // Check for convergence:
+      //
+      tag.error(std::fabs(rho*rho_0 / norm_rhs));
+
+      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
+        break;
+
+      if ( tag.error() < tag.tolerance() )
+        return result;
+    }
+
+    return result;
+  }
+
+}
+
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
+{
+  return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+/** @brief Convenience overload for calling the preconditioned GMRES solver using types from the C++ STL.
+  *
+  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+  */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
+{
+  viennacl::compressed_matrix<NumericT> vcl_A;
+  viennacl::copy(A, vcl_A);
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::copy(rhs, vcl_rhs);
+
+  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+  std::vector<NumericT> result(vcl_result.size());
+  viennacl::copy(vcl_result, result);
+  return result;
+}
+
+/** @brief Entry point for the unpreconditioned GMRES method.
+ *
+ *  @param A         The system matrix
+ *  @param rhs       Right hand side vector (load vector)
+ *  @param tag       A GMRES tag providing relative tolerances, the maximum number of iterations, and the Krylov dimension.
+ */
+
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & A, VectorT const & rhs, gmres_tag const & tag)
+{
+  return solve(A, rhs, tag, no_precond());
+}
+
+
+
+template<typename VectorT>
+class gmres_solver
+{
+public:
+  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;
+
+  gmres_solver(gmres_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+  template<typename MatrixT, typename PreconditionerT>
+  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+  {
+    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+    {
+      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+      mod_rhs = b - mod_rhs;
+      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+      return init_guess_ + y;
+    }
+    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+  }
+
+
+  template<typename MatrixT>
+  VectorT operator()(MatrixT const & A, VectorT const & b) const
+  {
+    return operator()(A, b, viennacl::linalg::no_precond());
+  }
+
+  /** @brief Specifies an initial guess for the iterative solver.
+    *
+    * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+    */
+  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+   *
+   *  The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+   *  The third argument is a pointer to user-defined data, through which additional information can be passed.
+   *  This pointer is supplied as the second argument of set_monitor(). If not set, NULL is passed.
+   *  If the monitor function returns true, the solver terminates (signaling either convergence or divergence).
+   */
+  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+  {
+    monitor_callback_ = monitor_fun;
+    user_data_ = user_data;
+  }
+
+  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+  gmres_tag const & tag() const { return tag_; }
+
+private:
+  gmres_tag  tag_;
+  VectorT    init_guess_;
+  bool       (*monitor_callback_)(VectorT const &, numeric_type, void *);
+  void       *user_data_;
+};
+
+
+}
+}
+
+#endif
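
For orientation, a minimal usage sketch of the GMRES interfaces introduced above (not part of the patch). The helper names and the example matrix/vector types are illustrative; the gmres_tag constructor arguments (tolerance, maximum iterations, Krylov dimension) and the include path viennacl/linalg/gmres.hpp are assumptions based on the surrounding code.

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/gmres.hpp"   // assumed location of the code above

// monitor callback: stop early once the relative residual estimate drops below 1e-12
bool my_monitor(viennacl::vector<double> const & /*current_x*/, double rel_residual, void * /*user_data*/)
{
  return rel_residual < 1e-12;
}

void gmres_example(viennacl::compressed_matrix<double> const & A,
                   viennacl::vector<double> const & b,
                   viennacl::vector<double> const & x0)
{
  // tolerance, maximum iterations, Krylov dimension (constructor arguments assumed)
  viennacl::linalg::gmres_tag tag(1e-8, 300, 20);

  // one-shot call without preconditioner (uses the overload above):
  viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);

  // solver object with initial guess and monitor callback:
  viennacl::linalg::gmres_solver<viennacl::vector<double> > solver(tag);
  solver.set_initial_guess(x0);
  solver.set_monitor(my_monitor, NULL);
  x = solver(A, b);
}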

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
new file mode 100644
index 0000000..43ca928
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
@@ -0,0 +1,66 @@
+#ifndef VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/hankel_matrix_operations.hpp
+    @brief Implementations of operations using hankel_matrix. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+#include "viennacl/linalg/toeplitz_matrix_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication with a hankel_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param A      The matrix
+* @param vec    The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hankel_matrix<NumericT, AlignmentV> const & A,
+               viennacl::vector_base<NumericT> const & vec,
+               viennacl::vector_base<NumericT>       & result)
+{
+  assert(A.size1() == result.size() && bool("Dimension mismatch"));
+  assert(A.size2() == vec.size()    && bool("Dimension mismatch"));
+
+  prod_impl(A.elements(), vec, result);
+  viennacl::linalg::reverse(result);
+}
+
+} //namespace linalg
+
+
+} //namespace viennacl
+
+
+#endif
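
A hedged usage sketch for the Hankel product above (not part of the patch): the high-level viennacl::linalg::prod() expression is assumed to dispatch to the prod_impl() shown here, i.e. the FFT-based Toeplitz product on A.elements() followed by a reversal of the result.

#include "viennacl/hankel_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

// y = H * x for a Hankel matrix H; y must already have size H.size1()
void hankel_prod_example(viennacl::hankel_matrix<float> const & H,
                         viennacl::vector<float> const & x,
                         viennacl::vector<float> & y)
{
  y = viennacl::linalg::prod(H, x);
}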

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
new file mode 100644
index 0000000..78bd150
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
@@ -0,0 +1,1123 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file host_based/amg_operations.hpp
+    @brief Implementations of routines for AMG using the CPU on the host (with OpenMP if enabled).
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+
+#include <map>
+#include <set>
+#include <functional>
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                           viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+  unsigned int *influences_values_ptr  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    influences_row_ptr[i] = A_row_buffer[i];
+    influences_values_ptr[i] = A_row_buffer[i+1] - A_row_buffer[i];
+  }
+  influences_row_ptr[A.size1()] = A_row_buffer[A.size1()];
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(A.nnz()); ++i)
+    influences_id_ptr[i] = A_col_buffer[i];
+}
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                            viennacl::linalg::amg_tag & tag)
+{
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+  //
+  // Step 1: Scan influences in order to allocate the necessary memory
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    unsigned int row_start = A_row_buffer[i];
+    unsigned int row_stop  = A_row_buffer[i+1];
+    NumericT diag = 0;
+    NumericT largest_positive = 0;
+    NumericT largest_negative = 0;
+    unsigned int num_influences = 0;
+
+    // obtain diagonal element as well as maximum positive and negative off-diagonal entries
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col = A_col_buffer[nnz_index];
+      NumericT value   = A_elements[nnz_index];
+
+      if (col == i)
+        diag = value;
+      else if (value > largest_positive)
+        largest_positive = value;
+      else if (value < largest_negative)
+        largest_negative = value;
+    }
+
+    if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal entries
+    {
+      influences_row_ptr[i] = 0;
+      continue;
+    }
+
+    // Find all points that strongly influence current point (Yang, p.5)
+    //std::cout << "Looking for strongly influencing points for point " << i << std::endl;
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col = A_col_buffer[nnz_index];
+
+      if (i == col)
+        continue;
+
+      NumericT value   = A_elements[nnz_index];
+
+      if (   (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative)
+          || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive))
+      {
+        ++num_influences;
+      }
+    }
+
+    influences_row_ptr[i] = num_influences;
+  }
+
+  //
+  // Step 2: Exclusive scan on number of influences to obtain CSR-like datastructure
+  //
+  unsigned int current_entry = 0;
+  for (std::size_t i=0; i<A.size1(); ++i)
+  {
+    unsigned int tmp = influences_row_ptr[i];
+    influences_row_ptr[i] = current_entry;
+    current_entry += tmp;
+  }
+  influences_row_ptr[A.size1()] = current_entry;
+
+
+  //
+  // Step 3: Write actual influences
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+  {
+    unsigned int i = static_cast<unsigned int>(i2);
+    unsigned int row_start = A_row_buffer[i];
+    unsigned int row_stop  = A_row_buffer[i+1];
+    NumericT diag = 0;
+    NumericT largest_positive = 0;
+    NumericT largest_negative = 0;
+
+    // obtain diagonal element as well as maximum positive and negative off-diagonal entries
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col = A_col_buffer[nnz_index];
+      NumericT value   = A_elements[nnz_index];
+
+      if (col == i)
+        diag = value;
+      else if (value > largest_positive)
+        largest_positive = value;
+      else if (value < largest_negative)
+        largest_negative = value;
+    }
+
+    if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal entries
+      continue;
+
+    // Find all points that strongly influence current point (Yang, p.5)
+    //std::cout << "Looking for strongly influencing points for point " << i << std::endl;
+    unsigned int *influences_id_write_ptr = influences_id_ptr + influences_row_ptr[i];
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col = A_col_buffer[nnz_index];
+
+      if (i == col)
+        continue;
+
+      NumericT value   = A_elements[nnz_index];
+
+      if (   (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative)
+          || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive))
+      {
+        //std::cout << " - Adding influence from point " << col << std::endl;
+        *influences_id_write_ptr = col;
+        ++influences_id_write_ptr;
+      }
+    }
+  }
+
+}
+
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+  // TODO: dispatch based on influence tolerance provided
+  amg_influence_trivial(A, amg_context, tag);
+}
+
+
+
+/** @brief Assign IDs to coarse points */
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+  unsigned int *point_types_ptr  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *coarse_id_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+  unsigned int coarse_id = 0;
+  for (vcl_size_t i=0; i<amg_context.coarse_id_.size(); ++i)
+  {
+    //assert(point_types_ptr[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED && bool("Logic error in enumerate_coarse_points(): Undecided points detected!"));
+
+    if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+      coarse_id_ptr[i] = coarse_id++;
+  }
+
+  //std::cout << "Coarse nodes after enumerate_coarse_points(): " << coarse_id << std::endl;
+  amg_context.num_coarse_ = coarse_id;
+}
+
+
+
+
+//////////////////////////////////////
+
+
+/** @brief Helper struct for sequential classical one-pass coarsening */
+struct amg_id_influence
+{
+  amg_id_influence(std::size_t id2, std::size_t influences2) : id(static_cast<unsigned int>(id2)), influences(static_cast<unsigned int>(influences2)) {}
+
+  unsigned int  id;
+  unsigned int  influences;
+};
+
+inline bool operator>(amg_id_influence const & a, amg_id_influence const & b)
+{
+  if (a.influences > b.influences)
+    return true;
+  if (a.influences == b.influences)
+    return a.id > b.id;
+  return false;
+}
+
+/** @brief Classical (RS) one-pass coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_CLASSIC_ONEPASS)
+*
+* @param A             Operator matrix for the respective level
+* @param amg_context   AMG datastructure object for the grid hierarchy
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_classic_onepass(compressed_matrix<NumericT> const & A,
+                                viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                                viennacl::linalg::amg_tag & tag)
+{
+  unsigned int *point_types_ptr       = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *influences_row_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+  unsigned int *influences_values_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle());
+
+  std::set<amg_id_influence, std::greater<amg_id_influence> > points_by_influences;
+
+  amg_influence_advanced(A, amg_context, tag);
+
+  for (std::size_t i=0; i<A.size1(); ++i)
+    points_by_influences.insert(amg_id_influence(i, influences_values_ptr[i]));
+
+  //std::cout << "Starting coarsening process..." << std::endl;
+
+  while (!points_by_influences.empty())
+  {
+    amg_id_influence point = *(points_by_influences.begin());
+
+    // remove point from queue:
+    points_by_influences.erase(points_by_influences.begin());
+
+    //std::cout << "Working on point " << point.id << std::endl;
+
+    // point is already coarse or fine point, continue;
+    if (point_types_ptr[point.id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+      continue;
+
+    //std::cout << " Setting point " << point.id << " to a coarse point." << std::endl;
+    // make this a coarse point:
+    point_types_ptr[point.id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+
+    // Set strongly influenced points to fine points:
+    unsigned int j_stop = influences_row_ptr[point.id + 1];
+    for (unsigned int j = influences_row_ptr[point.id]; j < j_stop; ++j)
+    {
+      unsigned int influenced_point_id = influences_id_ptr[j];
+
+      //std::cout << "Checking point " << influenced_point_id << std::endl;
+      if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+        continue;
+
+      //std::cout << " Setting point " << influenced_point_id << " to a fine point." << std::endl;
+      point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+
+      // add one to influence measure for all undecided points strongly influencing this fine point.
+      unsigned int k_stop = influences_row_ptr[influenced_point_id + 1];
+      for (unsigned int k = influences_row_ptr[influenced_point_id]; k < k_stop; ++k)
+      {
+        unsigned int influenced_influenced_point_id = influences_id_ptr[k];
+        if (point_types_ptr[influenced_influenced_point_id] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+        {
+          // grab and remove from set, increase influence counter, store back:
+          amg_id_influence point_to_find(influenced_influenced_point_id, influences_values_ptr[influenced_influenced_point_id]);
+          points_by_influences.erase(point_to_find);
+
+          point_to_find.influences += 1;
+          influences_values_ptr[influenced_influenced_point_id] += 1; // for consistency
+
+          points_by_influences.insert(point_to_find);
+        }
+      } //for
+    } // for
+
+  } // while
+
+  viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
+}
+
+
+//////////////////////////
+
+
+/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
+*
+* @param A             Operator matrix for the respective level
+* @param amg_context   AMG datastructure object for the grid hierarchy
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_sequential(compressed_matrix<NumericT> const & A,
+                                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                                     viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  unsigned int *point_types_ptr       = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *influences_row_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+  for (unsigned int i=0; i<static_cast<unsigned int>(A.size1()); ++i)
+  {
+    // check if node has no aggregates next to it (MIS-2)
+    bool is_new_coarse_node = true;
+
+    // Set strongly influenced points to fine points:
+    unsigned int j_stop = influences_row_ptr[i + 1];
+    for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+    {
+      unsigned int influenced_point_id = influences_id_ptr[j];
+      if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+      {
+        is_new_coarse_node = false;
+        break;
+      }
+    }
+
+    if (is_new_coarse_node)
+    {
+      // make all strongly influenced neighbors fine points:
+      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+      {
+        unsigned int influenced_point_id = influences_id_ptr[j];
+        point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+      }
+
+      //std::cout << "Setting new coarse node: " << i << std::endl;
+      // Note: influences may include diagonal element, so it's important to *first* set fine points above before setting the coarse information here
+      point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+    }
+  }
+}
+
+
+
+/** @brief AG (aggregation based) coarsening, multi-threaded version of stage 1 using parallel maximum independent sets
+*
+* @param A             Operator matrix for the respective level
+* @param amg_context   AMG datastructure object for the grid hierarchy
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                               viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  unsigned int  *point_types_ptr       = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *influences_row_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+  std::vector<unsigned int> random_weights(A.size1());
+  for (std::size_t i=0; i<random_weights.size(); ++i)
+    random_weights[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+
+  std::size_t num_threads = 1;
+#ifdef VIENNACL_WITH_OPENMP
+  num_threads = omp_get_max_threads();
+#endif
+
+  viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));
+
+  unsigned int *work_state_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state.handle());
+  unsigned int *work_random_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random.handle());
+  unsigned int *work_index_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index.handle());
+
+  viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));
+
+  unsigned int *work_state2_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state2.handle());
+  unsigned int *work_random2_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random2.handle());
+  unsigned int *work_index2_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index2.handle());
+
+
+  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+  unsigned int pmis_iters = 0;
+  while (num_undecided > 0)
+  {
+    ++pmis_iters;
+
+    //
+    // init temporary work data:
+    //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+    {
+      unsigned int i = static_cast<unsigned int>(i2);
+      switch (point_types_ptr[i])
+      {
+      case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state_ptr[i] = 1; break;
+      case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:      work_state_ptr[i] = 0; break;
+      case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    work_state_ptr[i] = 2; break;
+      default:
+        throw std::runtime_error("Unexpected state encountered in MIS2 setup for AMG.");
+      }
+
+      work_random_ptr[i] = random_weights[i];
+      work_index_ptr[i]  = i;
+    }
+
+
+    //
+    // Propagate maximum tuple twice
+    //
+    for (unsigned int r = 0; r < 2; ++r)
+    {
+      // max operation
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for
+#endif
+      for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+      {
+        unsigned int i = static_cast<unsigned int>(i2);
+        // load
+        unsigned int state  = work_state_ptr[i];
+        unsigned int random = work_random_ptr[i];
+        unsigned int index  = work_index_ptr[i];
+
+        // max
+        unsigned int j_stop = influences_row_ptr[i + 1];
+        for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+        {
+          unsigned int influenced_point_id = influences_id_ptr[j];
+
+          // lexicographical triple-max (not particularly pretty, but does the job):
+          if (state < work_state_ptr[influenced_point_id])
+          {
+            state  = work_state_ptr[influenced_point_id];
+            random = work_random_ptr[influenced_point_id];
+            index  = work_index_ptr[influenced_point_id];
+          }
+          else if (state == work_state_ptr[influenced_point_id])
+          {
+            if (random < work_random_ptr[influenced_point_id])
+            {
+              state  = work_state_ptr[influenced_point_id];
+              random = work_random_ptr[influenced_point_id];
+              index  = work_index_ptr[influenced_point_id];
+            }
+            else if (random == work_random_ptr[influenced_point_id])
+            {
+              if (index < work_index_ptr[influenced_point_id])
+              {
+                state  = work_state_ptr[influenced_point_id];
+                random = work_random_ptr[influenced_point_id];
+                index  = work_index_ptr[influenced_point_id];
+              }
+            } // max(random)
+          } // max(state)
+        } // for
+
+        // store
+        work_state2_ptr[i]  = state;
+        work_random2_ptr[i] = random;
+        work_index2_ptr[i]  = index;
+      }
+
+      // copy work array
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for
+#endif
+      for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+      {
+        unsigned int i = static_cast<unsigned int>(i2);
+        work_state_ptr[i]  = work_state2_ptr[i];
+        work_random_ptr[i] = work_random2_ptr[i];
+        work_index_ptr[i]  = work_index2_ptr[i];
+      }
+    }
+
+    //
+    // mark MIS and non-MIS nodes:
+    //
+    std::vector<unsigned int> thread_buffer(num_threads);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+    {
+      unsigned int i = static_cast<unsigned int>(i2);
+      unsigned int max_state  = work_state_ptr[i];
+      unsigned int max_index  = work_index_ptr[i];
+
+      if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+      {
+        if (i == max_index) // make this a MIS node
+          point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+        else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
+          point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+        else
+#ifdef VIENNACL_WITH_OPENMP
+          thread_buffer[omp_get_thread_num()] += 1;
+#else
+          thread_buffer[0] += 1;
+#endif
+      }
+    }
+
+    num_undecided = 0;
+    for (std::size_t i=0; i<thread_buffer.size(); ++i)
+      num_undecided += thread_buffer[i];
+  } // while
+
+  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<static_cast<long>(A.size1()); ++i)
+    if (point_types_ptr[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+      point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
+
+}
+
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A             Operator matrix for the respective level
+* @param amg_context   AMG datastructure object for the grid hierarchy
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+  unsigned int *point_types_ptr       = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *influences_row_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+  unsigned int *coarse_id_ptr         = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+  amg_influence_trivial(A, amg_context, tag);
+
+  //
+  // Stage 1: Build aggregates:
+  //
+  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION)      amg_coarse_ag_stage1_sequential(A, amg_context, tag);
+  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+
+  viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
+
+  //
+  // Stage 2: Propagate coarse aggregate indices to neighbors:
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+  {
+    unsigned int i = static_cast<unsigned int>(i2);
+    if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+    {
+      unsigned int coarse_index = coarse_id_ptr[i];
+
+      unsigned int j_stop = influences_row_ptr[i + 1];
+      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+      {
+        unsigned int influenced_point_id = influences_id_ptr[j];
+        coarse_id_ptr[influenced_point_id] = coarse_index; // Set aggregate index for fine point
+
+        if (influenced_point_id != i) // Note: Any write races between threads are harmless here
+          point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+      }
+    }
+  }
+
+
+  //
+  // Stage 3: Merge remaining undecided points (merging each into the first aggregate found when cycling over its influences)
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+  {
+    unsigned int i = static_cast<unsigned int>(i2);
+    if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+    {
+      unsigned int j_stop = influences_row_ptr[i + 1];
+      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+      {
+        unsigned int influenced_point_id = influences_id_ptr[j];
+        if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+        {
+          //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl;
+          coarse_id_ptr[i] = coarse_id_ptr[influenced_point_id];
+          break;
+        }
+      }
+    }
+  }
+
+  //
+  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+  //          Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<static_cast<long>(A.size1()); ++i)
+    if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+      point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+
+}
+
+
+
+
+/** @brief Entry point and dispatcher for coarsening procedures
+*
+* @param A             Operator matrix for the respective level
+* @param amg_context   AMG datastructure object for the grid hierarchy
+* @param tag           AMG preconditioner tag
+*/
+template<typename MatrixT>
+void amg_coarse(MatrixT & A,
+                viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_coarsening_method())
+  {
+  case viennacl::linalg::AMG_COARSENING_METHOD_ONEPASS: amg_coarse_classic_onepass(A, amg_context, tag); break;
+  case viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION:
+  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+  //default: throw std::runtime_error("not implemented yet");
+  }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+
+/** @brief Direct interpolation. Multi-threaded! (VIENNACL_AMG_INTERPOL_DIRECT)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_interpol_direct(compressed_matrix<NumericT> const & A,
+                         compressed_matrix<NumericT> & P,
+                         viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                         viennacl::linalg::amg_tag & tag)
+{
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  unsigned int *point_types_ptr       = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+  unsigned int *influences_row_ptr    = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+  unsigned int *influences_id_ptr     = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+  unsigned int *coarse_id_ptr         = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+  P.resize(A.size1(), amg_context.num_coarse_, false);
+
+  std::vector<std::map<unsigned int, NumericT> > P_setup(A.size1());
+
+  // Iterate over all points to build the interpolation matrix row-by-row
+  // Interpolation for coarse points is immediate using '1'.
+  // Interpolation for fine points is set up via corresponding row weights (cf. Yang paper, p. 14)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
+  {
+    unsigned int row = static_cast<unsigned int>(row2);
+    std::map<unsigned int, NumericT> & P_setup_row = P_setup[row];
+    //std::cout << "Row " << row << ": " << std::endl;
+    if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+    {
+      //std::cout << "  Setting value 1.0 at " << coarse_id_ptr[row] << std::endl;
+      P_setup_row[coarse_id_ptr[row]] = NumericT(1);
+    }
+    else if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE)
+    {
+      //std::cout << "Building interpolant for fine point " << row << std::endl;
+
+      NumericT row_sum = 0;
+      NumericT row_coarse_sum = 0;
+      NumericT diag = 0;
+
+      // Row sum of coefficients (without diagonal) and sum of influencing coarse point coefficients has to be computed
+      unsigned int row_A_start = A_row_buffer[row];
+      unsigned int row_A_end   = A_row_buffer[row + 1];
+      unsigned int const *influence_iter = influences_id_ptr + influences_row_ptr[row];
+      unsigned int const *influence_end  = influences_id_ptr + influences_row_ptr[row + 1];
+      for (unsigned int index = row_A_start; index < row_A_end; ++index)
+      {
+        unsigned int col = A_col_buffer[index];
+        NumericT value = A_elements[index];
+
+        if (col == row)
+        {
+          diag = value;
+          continue;
+        }
+        else if (point_types_ptr[col] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+        {
+          // Note: influence_iter never needs to be reset, because it traverses an ordered subset of the column indices in this row
+          while (influence_iter != influence_end && *influence_iter < col)
+            ++influence_iter;
+
+          if (influence_iter != influence_end && *influence_iter == col)
+            row_coarse_sum += value;
+        }
+
+        row_sum += value;
+      }
+
+      NumericT temp_res = -row_sum/(row_coarse_sum*diag);
+      //std::cout << "row_sum: " << row_sum << ", row_coarse_sum: " << row_coarse_sum << ", diag: " << diag << std::endl;
+
+      if (std::fabs(temp_res) > 1e-2 * std::fabs(diag))
+      {
+        // Iterate over all strongly influencing points to build the interpolant
+        influence_iter = influences_id_ptr + influences_row_ptr[row];
+        for (unsigned int index = row_A_start; index < row_A_end; ++index)
+        {
+          unsigned int col = A_col_buffer[index];
+          if (point_types_ptr[col] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+            continue;
+          NumericT value = A_elements[index];
+
+          // Advance to correct influence metric:
+          while (influence_iter != influence_end && *influence_iter < col)
+            ++influence_iter;
+
+          if (influence_iter != influence_end && *influence_iter == col)
+          {
+            //std::cout << " Setting entry "  << temp_res * value << " at " << coarse_id_ptr[col] << " for point " << col << std::endl;
+            P_setup_row[coarse_id_ptr[col]] = temp_res * value;
+          }
+        }
+      }
+
+      // TODO truncate interpolation if specified by the user.
+      (void)tag;
+    }
+    else
+      throw std::runtime_error("Logic error in direct interpolation: Point is neither coarse-point nor fine-point!");
+  }
+
+  // TODO: P_setup can be avoided without sacrificing parallelism.
+  viennacl::tools::sparse_matrix_adapter<NumericT> P_adapter(P_setup, P.size1(), P.size2());
+  viennacl::copy(P_adapter, P);
+}
+
+
+/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_AG)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  NumericT     * P_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(P.handle());
+  unsigned int * P_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle1());
+  unsigned int * P_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle2());
+
+  unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+  // Build interpolation matrix:
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long row2 = 0; row2 < long(A.size1()); ++row2)
+  {
+    unsigned int row = static_cast<unsigned int>(row2);
+    P_elements[row]   = NumericT(1);
+    P_row_buffer[row] = row;
+    P_col_buffer[row] = coarse_id_ptr[row];
+  }
+  P_row_buffer[A.size1()] = static_cast<unsigned int>(A.size1()); // don't forget finalizer
+
+  P.generate_row_block_information();
+}
+
+
+/** @brief Smoothed aggregation interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  // form tentative operator:
+  amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+
+  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+  unsigned int * Jacobi_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle1());
+  unsigned int * Jacobi_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle2());
+  NumericT     * Jacobi_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(Jacobi.handle());
+
+
+  // Build Jacobi matrix:
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
+  {
+    unsigned int row = static_cast<unsigned int>(row2);
+    unsigned int row_begin = A_row_buffer[row];
+    unsigned int row_end   = A_row_buffer[row+1];
+
+    Jacobi_row_buffer[row] = row_begin;
+
+    // Step 1: Extract diagonal:
+    NumericT diag = 0;
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      if (A_col_buffer[j] == row)
+      {
+        diag = A_elements[j];
+        break;
+      }
+    }
+
+    // Step 2: Write entries:
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col_index = A_col_buffer[j];
+      Jacobi_col_buffer[j] = col_index;
+
+      if (col_index == row)
+        Jacobi_elements[j] = NumericT(1) - NumericT(tag.get_jacobi_weight());
+      else
+        Jacobi_elements[j] = - NumericT(tag.get_jacobi_weight()) * A_elements[j] / diag;
+    }
+  }
+  Jacobi_row_buffer[A.size1()] = static_cast<unsigned int>(Jacobi.nnz()); // don't forget finalizer
+
+  P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+  P.generate_row_block_information();
+}
+
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+                  MatrixT & P,
+                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                  viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_interpolation_method())
+  {
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_DIRECT:               amg_interpol_direct (A, P, amg_context, tag); break;
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag     (A, P, amg_context, tag); break;
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa     (A, P, amg_context, tag); break;
+  default: throw std::runtime_error("Not implemented yet!");
+  }
+}
+
+
+/** @brief Computes B = trans(A).
+  *
+  * To be replaced by native functionality in ViennaCL.
+  */
+template<typename NumericT>
+void amg_transpose(compressed_matrix<NumericT> const & A,
+                   compressed_matrix<NumericT> & B)
+{
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  // initialize datastructures for B:
+  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+  NumericT     * B_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
+  unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
+  unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+  // prepare uninitialized B_row_buffer:
+  for (std::size_t i = 0; i < B.size1(); ++i)
+    B_row_buffer[i] = 0;
+
+  //
+  // Stage 1: Compute pattern for B
+  //
+  for (std::size_t row = 0; row < A.size1(); ++row)
+  {
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+      B_row_buffer[A_col_buffer[nnz_index]] += 1;
+  }
+
+  // Bring row-start array in place using an exclusive scan over the per-row counts:
+  unsigned int offset = B_row_buffer[0];
+  B_row_buffer[0] = 0;
+  for (std::size_t row = 1; row < B.size1(); ++row)
+  {
+    unsigned int tmp = B_row_buffer[row];
+    B_row_buffer[row] = offset;
+    offset += tmp;
+  }
+  B_row_buffer[B.size1()] = offset;
+
+  //
+  // Stage 2: Fill with data
+  //
+
+  std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
+
+  for (std::size_t row = 0; row < A.size1(); ++row)
+  {
+    //std::cout << "Row " << row << ": ";
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col_in_A = A_col_buffer[nnz_index];
+      unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];
+      B_col_buffer[B_nnz_index] = static_cast<unsigned int>(row);
+      B_elements[B_nnz_index] = A_elements[nnz_index];
+      ++B_row_offsets[col_in_A];
+      //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
+    }
+  }
+
+  // Step 3: Make datastructure consistent (row blocks!)
+  B.generate_row_block_information();
+}
+
+/** @brief Assigns a sparse matrix A to a dense matrix B */
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                     viennacl::matrix_base<NumericT> & B)
+{
+  NumericT     const * A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  NumericT           * B_elements   = detail::extract_raw_pointer<NumericT>(B.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = A_row_buffer[row]; nnz_index < row_stop; ++nnz_index)
+      B_elements[static_cast<unsigned int>(row) * static_cast<unsigned int>(B.internal_size2()) + A_col_buffer[nnz_index]] = A_elements[nnz_index];
+  }
+
+}
+
+/** @brief Damped Jacobi smoother (host-based version, using OpenMP if enabled)
+*
+* @param iterations  Number of smoother iterations
+* @param A           Operator matrix for the smoothing
+* @param x           The vector smoothing is applied to
+* @param x_backup    Auxiliary vector (distinct from x) holding the values of x from the previous sweep
+* @param rhs_smooth  The right hand side of the equation for the smoother
+* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+                   compressed_matrix<NumericT> const & A,
+                   vector<NumericT> & x,
+                   vector<NumericT> & x_backup,
+                   vector<NumericT> const & rhs_smooth,
+                   NumericT weight)
+{
+
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const * rhs_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(rhs_smooth.handle());
+
+  NumericT           * x_elements     = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x.handle());
+  NumericT     const * x_old_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x_backup.handle());
+
+  for (unsigned int i=0; i<iterations; ++i)
+  {
+    x_backup = x;
+
+    #ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+    #endif
+    for (long row2 = 0; row2 < static_cast<long>(A.size1()); ++row2)
+    {
+      unsigned int row = static_cast<unsigned int>(row2);
+      unsigned int col_end   = A_row_buffer[row+1];
+
+      NumericT sum  = NumericT(0);
+      NumericT diag = NumericT(1);
+      for (unsigned int index = A_row_buffer[row]; index != col_end; ++index)
+      {
+        unsigned int col = A_col_buffer[index];
+        if (col == row)
+          diag = A_elements[index];
+        else
+          sum += A_elements[index] * x_old_elements[col];
+      }
+
+      x_elements[row] = weight * (rhs_elements[row] - sum) / diag + (NumericT(1) - weight) * x_old_elements[row];
+    }
+  }
+}
+
+} //namespace amg
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif
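
To make the smoother update explicit, here is a minimal standalone sketch (not part of the patch) of one damped Jacobi sweep on plain CSR arrays, mirroring the computation in smooth_jacobi() above: x_new[i] = weight * (rhs[i] - sum over j != i of A(i,j) * x_old[j]) / A(i,i) + (1 - weight) * x_old[i]. Function and parameter names are illustrative.

#include <vector>
#include <cstddef>

void damped_jacobi_sweep(std::vector<unsigned int> const & row_ptr,
                         std::vector<unsigned int> const & col_idx,
                         std::vector<double>       const & values,
                         std::vector<double>       const & rhs,
                         std::vector<double>       const & x_old,
                         std::vector<double>             & x_new,
                         double weight)
{
  for (std::size_t row = 0; row + 1 < row_ptr.size(); ++row)
  {
    double sum  = 0.0;
    double diag = 1.0;
    for (unsigned int k = row_ptr[row]; k < row_ptr[row + 1]; ++k)
    {
      if (col_idx[k] == row)
        diag = values[k];                        // diagonal entry A(i,i)
      else
        sum += values[k] * x_old[col_idx[k]];    // off-diagonal contribution
    }
    x_new[row] = weight * (rhs[row] - sum) / diag + (1.0 - weight) * x_old[row];
  }
}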

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
new file mode 100644
index 0000000..8ddb4c1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
@@ -0,0 +1,149 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+#define VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/common.hpp
+    @brief Common routines for single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+
+template<typename ResultT, typename VectorT>
+ResultT * extract_raw_pointer(VectorT & vec)
+{
+  return reinterpret_cast<ResultT *>(viennacl::traits::ram_handle(vec).get());
+}
+
+template<typename ResultT, typename VectorT>
+ResultT const * extract_raw_pointer(VectorT const & vec)
+{
+  return reinterpret_cast<ResultT const *>(viennacl::traits::ram_handle(vec).get());
+}
+
+/** @brief Helper class for accessing a strided subvector of a larger vector. */
+template<typename NumericT>
+class vector_array_wrapper
+{
+public:
+  typedef NumericT   value_type;
+
+  vector_array_wrapper(value_type * A,
+                       vcl_size_t start,
+                       vcl_size_t inc)
+   : A_(A),
+     start_(start),
+     inc_(inc) {}
+
+  value_type & operator()(vcl_size_t i) { return A_[i * inc_ + start_]; }
+
+private:
+  value_type * A_;
+  vcl_size_t start_;
+  vcl_size_t inc_;
+};
+
+
+/** @brief Helper array for accessing a strided submatrix embedded in a larger matrix. */
+template<typename NumericT, typename LayoutT, bool is_transposed>
+class matrix_array_wrapper
+{
+  public:
+    typedef NumericT   value_type;
+
+    matrix_array_wrapper(value_type * A,
+                         vcl_size_t start1, vcl_size_t start2,
+                         vcl_size_t inc1,   vcl_size_t inc2,
+                         vcl_size_t internal_size1, vcl_size_t internal_size2)
+     : A_(A),
+       start1_(start1), start2_(start2),
+       inc1_(inc1), inc2_(inc2),
+       internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+    value_type & operator()(vcl_size_t i, vcl_size_t j)
+    {
+      return A_[LayoutT::mem_index(i * inc1_ + start1_,
+                                   j * inc2_ + start2_,
+                                   internal_size1_, internal_size2_)];
+    }
+
+    // convenience overloads to address signed index types for OpenMP:
+    value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); }
+    value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); }
+    value_type & operator()(long i, long j)       { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
+
+  private:
+    value_type * A_;
+    vcl_size_t start1_, start2_;
+    vcl_size_t inc1_, inc2_;
+    vcl_size_t internal_size1_, internal_size2_;
+};
+
+/** \cond */
+template<typename NumericT, typename LayoutT>
+class matrix_array_wrapper<NumericT, LayoutT, true>
+{
+public:
+  typedef NumericT   value_type;
+
+  matrix_array_wrapper(value_type * A,
+                       vcl_size_t start1, vcl_size_t start2,
+                       vcl_size_t inc1,   vcl_size_t inc2,
+                       vcl_size_t internal_size1, vcl_size_t internal_size2)
+   : A_(A),
+     start1_(start1), start2_(start2),
+     inc1_(inc1), inc2_(inc2),
+     internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+  value_type & operator()(vcl_size_t i, vcl_size_t j)
+  {
+    //swapping row and column indices here
+    return A_[LayoutT::mem_index(j * inc1_ + start1_,
+                                 i * inc2_ + start2_,
+                                 internal_size1_, internal_size2_)];
+  }
+
+  // convenience overloads to address signed index types for OpenMP:
+  value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); }
+  value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); }
+  value_type & operator()(long i, long j) { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
+
+private:
+  value_type * A_;
+  vcl_size_t start1_, start2_;
+  vcl_size_t inc1_, inc2_;
+  vcl_size_t internal_size1_, internal_size2_;
+};
+/** \endcond */
+
+} //namespace detail
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
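
A hedged usage sketch for the strided submatrix access above (not part of the patch): viennacl::row_major is assumed to provide the static mem_index() used by matrix_array_wrapper, so a 2x2 block starting at row 1, column 1 of a 4x4 row-major buffer can be addressed as follows.

#include "viennacl/matrix.hpp"                          // brings in viennacl::row_major (assumed)
#include "viennacl/linalg/host_based/common.hpp"

void wrapper_example()
{
  double buffer[16] = {0};   // 4x4 row-major matrix, internal sizes 4 x 4

  // view the 2x2 block starting at (1,1) with unit strides:
  viennacl::linalg::host_based::detail::matrix_array_wrapper<double, viennacl::row_major, false>
      block(buffer, 1, 1, 1, 1, 4, 4);

  block(0, 0) = 1.0;   // writes buffer[1*4 + 1] = buffer[5]
  block(1, 1) = 2.0;   // writes buffer[2*4 + 2] = buffer[10]
}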


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
new file mode 100644
index 0000000..617b128
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
@@ -0,0 +1,297 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
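
The OpenCL BLAS-1 wrappers above only wrap existing cl_mem buffers in viennacl::vector_base views and then call the generic ViennaCL routines. A minimal sketch of the same xDOT/xNRM2 computations through the plain C++ API follows; it uses no cl_mem handles, assumes the default compute backend of the build is available, and assumes the viennacl::copy(host_vector, device_vector) convenience overload for the transfer.

// Sketch of the computation the xDOT / xNRM2 wrappers above delegate to,
// written against the plain C++ API (no cl_mem handles).
#include <iostream>
#include <vector>

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

int main()
{
  std::vector<float> x_host(16, 1.0f);
  std::vector<float> y_host(16, 2.0f);

  viennacl::vector<float> x(16), y(16);
  viennacl::copy(x_host, x);   // host -> device transfer
  viennacl::copy(y_host, y);

  float dot  = viennacl::linalg::inner_prod(x, y);   // what xDOT computes: 32
  float nrm2 = viennacl::linalg::norm_2(x);          // what xNRM2 computes: 4

  std::cout << "dot = " << dot << ", nrm2 = " << nrm2 << std::endl;
  return 0;
}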

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
new file mode 100644
index 0000000..bc2c095
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
@@ -0,0 +1,231 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      v2 *= beta->value_float;
+      if (A->trans == ViennaCLTrans)
+        v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+      else
+        v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      v2 *= beta->value_double;
+      if (A->trans == ViennaCLTrans)
+        v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+      else
+        v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      if (A->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+      }
+      else
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      if (A->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+      }
+      else
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+      mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+      mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+
+      return ViennaCLSuccess;
+    }
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
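
ViennaCLgemv above scales y by beta first and then accumulates alpha * op(A) * x. A sketch of that update with the C++ types the wrapper constructs internally; gemv_sketch is a hypothetical name used for illustration only.

// Sketch of the update ViennaCLgemv performs, using the C++ types the wrapper
// builds from its handles.
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void gemv_sketch(viennacl::matrix<float> const & A,
                 viennacl::vector<float> const & x,
                 viennacl::vector<float>       & y,
                 float alpha, float beta, bool transposed)
{
  y *= beta;                                                     // scale y first, as above
  if (transposed)
    y += alpha * viennacl::linalg::prod(viennacl::trans(A), x);  // y += alpha * A^T * x
  else
    y += alpha * viennacl::linalg::prod(A, x);                   // y += alpha * A * x
}

Scaling y by beta before accumulating the product gives the usual BLAS semantics y <- alpha*op(A)*x + beta*y.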

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
new file mode 100644
index 0000000..bc2c095
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
@@ -0,0 +1,231 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      v2 *= beta->value_float;
+      if (A->trans == ViennaCLTrans)
+        v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+      else
+        v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      v2 *= beta->value_double;
+      if (A->trans == ViennaCLTrans)
+        v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+      else
+        v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      if (A->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+      }
+      else
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      if (A->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+      }
+      else
+      {
+        if (uplo == ViennaCLUpper)
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+        else
+          viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type           size_type;
+      typedef viennacl::vector_base<float>::size_type           difference_type;
+
+      viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<float> mat(A_handle,
+                                       size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                       size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+      mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type           size_type;
+      typedef viennacl::vector_base<double>::size_type           difference_type;
+
+      viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+      viennacl::matrix_base<double> mat(A_handle,
+                                        size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                        size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+      mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+
+      return ViennaCLSuccess;
+    }
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
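
blas2.cu repeats blas2.cpp verbatim; in both, xGER reduces to a rank-1 update via outer_prod, which produces a matrix with x.size() rows and y.size() columns. A sketch of that update with the same includes as above; ger_sketch is a hypothetical name used for illustration only.

// Sketch of the rank-1 update behind ViennaCLger: A <- A + alpha * x * y^T.
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void ger_sketch(viennacl::matrix<double>       & A,
                viennacl::vector<double> const & x,
                viennacl::vector<double> const & y,
                double alpha)
{
  // outer_prod(x, y) has x.size() rows and y.size() columns,
  // so A must be sized accordingly before the update.
  A += alpha * viennacl::linalg::outer_prod(x, y);
}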

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
new file mode 100644
index 0000000..60c9293
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
@@ -0,0 +1,204 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+  viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                   m, offA_row, incA_row, m,
+                                   n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+  viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                    m, offA_row, incA_row, m,
+                                    n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                   n, offA_row, incA_row, n,
+                                   n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                    n, offA_row, incA_row, n,
+                                    n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+  viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                   m, offA_row, incA_row, m,
+                                   n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m,  ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+  viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                    m, offA_row, incA_row, m,
+                                    n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+#endif
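
The xTRSV wrappers above dispatch on uplo/diag/trans to viennacl::linalg::inplace_solve with the matching solver tag. A reduced sketch of that call for the upper-triangular, non-unit-diagonal case; trsv_sketch is a hypothetical name used for illustration only.

// Reduced sketch of the triangular-solve dispatch used by the xTRSV wrappers.
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/direct_solve.hpp"

void trsv_sketch(viennacl::matrix<double> const & A,   // assumed upper triangular
                 viennacl::vector<double>       & x)   // right-hand side, overwritten with the solution
{
  // Solves A * x = b in place; the other branches above simply swap in
  // unit_upper_tag(), lower_tag() or unit_lower_tag().
  viennacl::linalg::inplace_solve(A, x, viennacl::linalg::upper_tag());
}

unit_upper_tag, lower_tag and unit_lower_tag cover the remaining three cases, exactly as in the nested branches above.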

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
new file mode 100644
index 0000000..b1ef39a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
@@ -0,0 +1,219 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
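For readers of this archive who want to exercise the host-memory entry points above, here is a minimal stand-alone sketch (not part of the committed diff) that calls ViennaCLHostDtrsv to solve an upper-triangular system in place. It assumes the program is compiled against libviennacl's public header viennacl.hpp and linked with the library built from this module; ViennaCLNoTrans and ViennaCLNonUnit are assumed to be the usual counterparts of the ViennaCLTrans/ViennaCLUnit enumerators used above, and a null backend handle is passed because the Host* functions ignore their backend argument.

// Sketch only: in-place upper-triangular solve U*x = b on host memory.
// Assumptions: libviennacl's viennacl.hpp is on the include path and the
// enumerators ViennaCLNoTrans / ViennaCLNonUnit exist alongside the ones
// used in the wrappers above. The backend argument is unused and left null.
#include <cstddef>
#include <iostream>

#include "viennacl.hpp"

int main()
{
  const ViennaCLInt n = 3;

  // Row-major 3x3 upper-triangular matrix U, lda = n, no offsets, unit strides:
  //   [ 2 1 4 ]
  //   [ 0 3 5 ]
  //   [ 0 0 6 ]
  double U[] = { 2.0, 1.0, 4.0,
                 0.0, 3.0, 5.0,
                 0.0, 0.0, 6.0 };

  // Right-hand side b; overwritten with the solution x of U*x = b.
  double x[] = { 16.0, 21.0, 18.0 };

  ViennaCLStatus err = ViennaCLHostDtrsv(NULL /* backend unused by Host* functions */,
                                         ViennaCLUpper, ViennaCLRowMajor, ViennaCLNoTrans, ViennaCLNonUnit,
                                         n, U, 0, 0, 1, 1, n,
                                         x, 0, 1);
  if (err != ViennaCLSuccess)
    return 1;

  std::cout << x[0] << " " << x[1] << " " << x[2] << std::endl;   // prints: 1 2 3
  return 0;
}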

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
new file mode 100644
index 0000000..b1ef39a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
@@ -0,0 +1,219 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+  viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
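The GEMV wrappers at the top of this file follow the same pattern as the TRSV and GER ones: wrap the raw host pointers in vector_base/matrix_base views, then evaluate y = beta*y + alpha*op(A)*x. Below is a minimal usage sketch (not part of the committed diff), under the same assumptions as the TRSV sketch earlier: libviennacl's viennacl.hpp on the include path, ViennaCLNoTrans assumed to be the standard enumerator, and the backend handle ignored by the Host* functions.

// Sketch only: y = alpha*A*x + beta*y on host memory via ViennaCLHostSgemv.
#include <cstddef>
#include <iostream>

#include "viennacl.hpp"

int main()
{
  const ViennaCLInt m = 2, n = 3;

  // Row-major 2x3 matrix A with lda = n (leading dimension = row length)
  float A[] = { 1.0f, 2.0f, 3.0f,
                4.0f, 5.0f, 6.0f };

  float x[] = { 1.0f, 1.0f, 1.0f };   // length n
  float y[] = { 10.0f, 20.0f };       // length m, scaled by beta, then accumulated into

  ViennaCLStatus err = ViennaCLHostSgemv(NULL /* backend unused by Host* functions */,
                                         ViennaCLRowMajor, ViennaCLNoTrans,
                                         m, n,
                                         2.0f /* alpha */, A, 0, 0, 1, 1, n,
                                         x, 0, 1,
                                         0.5f /* beta */,
                                         y, 0, 1);
  if (err != ViennaCLSuccess)
    return 1;

  std::cout << y[0] << " " << y[1] << std::endl;   // prints: 17 40
  return 0;
}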

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
new file mode 100644
index 0000000..20c4994
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
@@ -0,0 +1,219 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             double alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
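Every xGER entry point above, host and OpenCL alike, reduces to the same ViennaCL expression, mat += alpha * outer_prod(v1, v2); only the wrapping of the raw buffers (host pointer vs. cl_mem plus OpenCL context) differs. The following minimal sketch of that core rank-1 update is written directly against ViennaCL's C++ containers (not part of the committed diff; it runs on whichever backend the headers were configured for, and it reuses the include set of the wrapper files above).

// Sketch only: the rank-1 update A += alpha * x * y^T that the xGER wrappers delegate to.
#include <iostream>
#include <vector>

// Same header set as the wrapper files above; it provides viennacl::linalg::outer_prod.
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/prod.hpp"

int main()
{
  const std::size_t m = 2, n = 3;

  std::vector<double> x_host(m);      // length m
  x_host[0] = 1.0; x_host[1] = 2.0;
  std::vector<double> y_host(n);      // length n
  y_host[0] = 3.0; y_host[1] = 4.0; y_host[2] = 5.0;

  viennacl::vector<double> x(m), y(n);
  viennacl::copy(x_host, x);
  viennacl::copy(y_host, y);

  // Start from an explicitly zeroed m x n matrix.
  std::vector< std::vector<double> > A_host(m, std::vector<double>(n, 0.0));
  viennacl::matrix<double> A(m, n);
  viennacl::copy(A_host, A);

  double alpha = 2.0;
  A += alpha * viennacl::linalg::outer_prod(x, y);  // A(i,j) += alpha * x(i) * y(j)

  std::cout << A << std::endl;  // 2x3 result: entry (i,j) holds alpha * x(i) * y(j)
  return 0;
}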


[07/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
new file mode 100644
index 0000000..120f636
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
@@ -0,0 +1,1193 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/scheduler/preset.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/device_specific/execution_handler.hpp"
+#include "viennacl/device_specific/builtin_database/matrix_product.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix.hpp
+ *  @brief Runtime generation of OpenCL kernels for matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in ambm-like operations */
+enum ambm_scalar_type
+{
+  VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
+  VIENNACL_AMBM_CPU,
+  VIENNACL_AMBM_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of matrices */
+struct ambm_config
+{
+  ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {}
+
+  bool with_stride_and_range;
+  bool is_row_major;
+  std::string      assign_op;
+  ambm_scalar_type a;
+  ambm_scalar_type b;
+};
+
+
+// just returns the for-loop
+template <typename StringType>
+void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
+{
+  if (cfg.is_row_major)
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+    source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+  }
+  else
+  {
+    source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+    source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+  }
+
+  if (cfg.with_stride_and_range)
+  {
+    if (cfg.is_row_major)
+      source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
+    else
+      source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] ");
+    source.append(cfg.assign_op);
+    if (cfg.is_row_major)
+      source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
+    else
+      source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
+
+    if (mult_alpha)
+      source.append("* alpha ");
+    else
+      source.append("/ alpha ");
+    if (cfg.b != VIENNACL_AMBM_NONE)
+    {
+      if (cfg.is_row_major)
+        source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
+      else
+        source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
+      if (mult_beta)
+        source.append("* beta");
+      else
+        source.append("/ beta");
+    }
+  }
+  else
+  {
+    if (cfg.is_row_major)
+      source.append("    A[row * A_internal_size2 + col] ");
+    else
+      source.append("    A[row + col * A_internal_size1] ");
+    source.append(cfg.assign_op);
+    if (cfg.is_row_major)
+      source.append(" B[row * B_internal_size2 + col] ");
+    else
+      source.append(" B[row + col * B_internal_size1] ");
+
+    if (mult_alpha)
+      source.append("* alpha ");
+    else
+      source.append("/ alpha ");
+    if (cfg.b != VIENNACL_AMBM_NONE)
+    {
+      if (cfg.is_row_major)
+        source.append("+ C[row * C_internal_size2 + col] ");
+      else
+        source.append("+ C[row + col * C_internal_size2] ");
+      if (mult_beta)
+        source.append("* beta");
+      else
+        source.append("/ beta");
+    }
+  }
+  source.append("; \n");
+}
+
+template <typename StringType>
+void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
+{
+  source.append("__kernel void am");
+  if (cfg.b != VIENNACL_AMBM_NONE)
+    source.append("bm");
+  if (cfg.assign_op != "=")
+    source.append("_m");
+
+  if (cfg.a == VIENNACL_AMBM_CPU)
+    source.append("_cpu");
+  else if (cfg.a == VIENNACL_AMBM_GPU)
+    source.append("_gpu");
+
+  if (cfg.b == VIENNACL_AMBM_CPU)
+    source.append("_cpu");
+  else if (cfg.b == VIENNACL_AMBM_GPU)
+    source.append("_gpu");
+  source.append("( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  if (cfg.a == VIENNACL_AMBM_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+  }
+  else if (cfg.a == VIENNACL_AMBM_GPU)
+  {
+    source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+  }
+  source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+  source.append("  __global const "); source.append(numeric_string); source.append(" * B, \n");
+  source.append("  unsigned int B_start1, unsigned int B_start2, \n");
+  source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
+  source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2");
+
+  if (cfg.b != VIENNACL_AMBM_NONE)
+  {
+    source.append(", \n\n");
+    if (cfg.b == VIENNACL_AMBM_CPU)
+    {
+      source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+    }
+    else if (cfg.b == VIENNACL_AMBM_GPU)
+    {
+      source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+    }
+    source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+    source.append("  __global const "); source.append(numeric_string); source.append(" * C, \n");
+    source.append("  unsigned int C_start1, unsigned int C_start2, \n");
+    source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
+    source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2 \n");
+  }
+  source.append(") { \n");
+
+  if (cfg.a == VIENNACL_AMBM_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+  }
+  else if (cfg.a == VIENNACL_AMBM_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+  }
+  source.append("  if (options2 & (1 << 0)) \n");
+  source.append("    alpha = -alpha; \n");
+  source.append(" \n");
+
+  if (cfg.b == VIENNACL_AMBM_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+  }
+  else if (cfg.b == VIENNACL_AMBM_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+  }
+  if (cfg.b != VIENNACL_AMBM_NONE)
+  {
+    source.append("  if (options3 & (1 << 0)) \n");
+    source.append("    beta = -beta; \n");
+    source.append(" \n");
+  }
+  source.append("  if (options2 & (1 << 1)) { \n");
+  if (cfg.b != VIENNACL_AMBM_NONE)
+  {
+    source.append("    if (options3 & (1 << 1)) {\n");
+    generate_ambm_impl2(source, cfg, false, false);
+    source.append("    } else {\n");
+    generate_ambm_impl2(source, cfg, false, true);
+    source.append("    } \n");
+  }
+  else
+    generate_ambm_impl2(source, cfg, false, true);
+  source.append("  } else { \n");
+  if (cfg.b != VIENNACL_AMBM_NONE)
+  {
+    source.append("    if (options3 & (1 << 1)) {\n");
+    generate_ambm_impl2(source, cfg, true, false);
+    source.append("    } else {\n");
+    generate_ambm_impl2(source, cfg, true, true);
+    source.append("    } \n");
+  }
+  else
+    generate_ambm_impl2(source, cfg, true, true);
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  ambm_config cfg;
+  cfg.assign_op = "=";
+  cfg.with_stride_and_range = true;
+  cfg.is_row_major = is_row_major;
+
+  // am
+  cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+
+  // ambm
+  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+
+  // ambm_m
+  cfg.assign_op = "+=";
+
+  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+}
+
+template <typename StringType>
+void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void assign_cpu( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
+  source.append("{ \n");
+  if (is_row_major)
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+    source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+    source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
+  }
+  else
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+    source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+    source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
+  }
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void diagonal_assign_cpu( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
+  if (is_row_major)
+    source.append("    A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
+  else
+    source.append("    A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void element_op( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
+  source.append("  unsigned int B_start1, unsigned int B_start2, \n");
+  source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
+  source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
+  source.append("  unsigned int C_start1, unsigned int C_start2, \n");
+  source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
+  source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2, \n");
+  source.append("  unsigned int op_type) \n"); //0: product, 1: division, 2: pow
+  source.append("{ \n");
+  if (is_row_major)
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  if (op_type == 2) {");
+    if (numeric_string == "float" || numeric_string == "double")
+    {
+      source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+      source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+      source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+      source.append("        pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
+      source.append("            C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
+    }
+    source.append("  } else if (op_type == 1) {");
+    source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+    source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+    source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+    source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
+    source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+    source.append("  } else if (op_type == 0) {");
+    source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+    source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+    source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+    source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
+    source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+    source.append("  }");
+  }
+  else
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+    source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+    source.append("  if (op_type == 2) {");
+    if (numeric_string == "float" || numeric_string == "double")
+    {
+      source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+      source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+      source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
+      source.append("          pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1], \n");
+      source.append("              C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]); \n");
+    }
+    source.append("  } else if (op_type == 1) {");
+    source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+    source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+    source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
+    source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] / \n");
+    source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
+    source.append("  } else if (op_type == 0) {");
+    source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+    source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+    source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = \n");
+    source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] * \n");
+    source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
+    source.append("  }");
+  }
+  source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+  // naive Fourier transform (quadratic complexity, use for reference only)
+  source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
+  source.append("                         __global "); source.append(numeric_string); source.append("2 *output, \n");
+  source.append("                         unsigned int size, \n");
+  source.append("                         unsigned int stride, \n");
+  source.append("                         unsigned int batch_num, \n");
+  source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
+  source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+  source.append(" \n");
+  source.append("    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+  source.append("        for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
+  source.append("            "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
+  source.append(" \n");
+  source.append("            for (unsigned int n = 0; n < size; n++) { \n");
+  source.append("                "); source.append(numeric_string); source.append("2 in = ");
+  if (is_row_major)
+    source.append("input[batch_id * stride + n]; \n"); //input index here
+  else
+    source.append("input[n * stride + batch_id]; \n"); //input index here
+  source.append(" \n");
+  source.append("                "); source.append(numeric_string); source.append(" sn, cs; \n");
+  source.append("                "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
+  source.append("                sn = sincos(arg, &cs); \n");
+  source.append(" \n");
+  source.append("                "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+  source.append("                f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
+  source.append("            } \n");
+  source.append(" \n");
+  if (is_row_major)
+    source.append("            output[batch_id * stride + k] = f; \n"); // output index here
+  else
+    source.append("            output[k * stride + batch_id] = f; \n"); // output index here
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append(" \n"); //////////////////////////////
+
+  source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
+  source.append("                         unsigned int s, \n");
+  source.append("                         unsigned int bit_size, \n");
+  source.append("                         unsigned int size, \n");
+  source.append("                         unsigned int stride, \n");
+  source.append("                         unsigned int batch_num, \n");
+  source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
+  source.append(" \n");
+  source.append("    unsigned int ss = 1 << s; \n");
+  source.append("    unsigned int half_size = size >> 1; \n");
+  source.append(" \n");
+  source.append("    "); source.append(numeric_string); source.append(" cs, sn; \n");
+  source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+  source.append(" \n");
+  source.append("    unsigned int glb_id = get_global_id(0); \n");
+  source.append("    unsigned int glb_sz = get_global_size(0); \n");
+
+  source.append("    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+  source.append("        for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
+  source.append("            unsigned int group = (tid & (ss - 1)); \n");
+  source.append("            unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+  if (is_row_major)
+  {
+    source.append("            unsigned int offset = batch_id * stride + pos; \n");
+    source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+    source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
+  }
+  else
+  {
+    source.append("            unsigned int offset = pos * stride + batch_id; \n");
+    source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+    source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
+  }
+
+  source.append("            "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+  source.append("            sn = sincos(arg, &cs); \n");
+
+  source.append("            "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+  source.append("            "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+  if (is_row_major)
+    source.append("            input[offset + ss] = in1 - tmp; \n");//index
+  else
+    source.append("            input[offset + ss * stride] = in1 - tmp; \n");//index
+  source.append("            input[offset] = in1 + tmp; \n");//index
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append(" \n"); //////////////////////////////
+
+  source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
+  source.append("     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+  source.append("     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+  source.append("     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+  source.append("     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+  source.append("     v = (v >> 16) | (v << 16); \n");
+  source.append("  \n");
+  source.append("     v = v >> (32 - bit_size); \n");
+  source.append("  \n");
+  source.append("     return v; \n");
+  source.append(" } \n");
+
+  source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
+  source.append("                                 __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
+  source.append("                                 unsigned int bit_size, \n");
+  source.append("                                 unsigned int size, \n");
+  source.append("                                 unsigned int stride, \n");
+  source.append("                                 unsigned int batch_num, \n");
+  source.append("                                 "); source.append(numeric_string); source.append(" sign) { \n");
+
+  source.append("     unsigned int grp_id = get_group_id(0); \n");
+  source.append("     unsigned int grp_num = get_num_groups(0); \n");
+
+  source.append("     unsigned int lcl_sz = get_local_size(0); \n");
+  source.append("     unsigned int lcl_id = get_local_id(0); \n");
+  source.append("     const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+  source.append("     for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
+          //unsigned int base_offset = stride * batch_id; \n");
+          //copy chunk of global memory to local \n");
+  source.append("         for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+  source.append("             unsigned int v = get_reorder_num(p, bit_size); \n");
+  if (is_row_major)
+    source.append("             lcl_input[v] = input[batch_id * stride + p]; \n"); //index
+  else
+    source.append("             lcl_input[v] = input[p * stride + batch_id]; \n"); //index
+  source.append("         } \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          //performs Cooley-Tukey FFT on local array
+  source.append("         for (unsigned int s = 0; s < bit_size; s++) { \n");
+  source.append("             unsigned int ss = 1 << s; \n");
+
+  source.append("             "); source.append(numeric_string); source.append(" cs, sn; \n");
+
+  source.append("             for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
+  source.append("                 unsigned int group = (tid & (ss - 1)); \n");
+  source.append("                 unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+  source.append("                 "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
+  source.append("                 "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
+
+  source.append("                 "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+  source.append("                 sn = sincos(arg, &cs); \n");
+  source.append("                 "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+  source.append("                 "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+  source.append("                 lcl_input[pos + ss] = in1 - tmp; \n");
+  source.append("                 lcl_input[pos] = in1 + tmp; \n");
+  source.append("             } \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("         } \n");
+
+          //copy local array back to global memory
+  source.append("         for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+  if (is_row_major)
+    source.append("             input[batch_id * stride + p] = lcl_input[p]; \n");//index
+  else
+    source.append("             input[p * stride + batch_id] = lcl_input[p]; \n");//index
+  source.append("         } \n");
+  source.append("     } \n");
+  source.append(" } \n");
+
+  source.append(" \n"); //////////////////////////////
+
+  //
+  // Performs reordering of the input data in bit-reversal order.
+  // It may be preferable to do this on the host side instead.
+  //
+  source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
+  source.append("    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+  source.append("    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+  source.append("    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+  source.append("    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+  source.append("    v = (v >> 16) | (v << 16); \n");
+
+  source.append("    v = v >> (32 - bit_size); \n");
+
+  source.append("    return v; \n");
+  source.append("} \n");
+
+  source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
+  source.append("                          unsigned int bit_size, \n");
+  source.append("                          unsigned int size, \n");
+  source.append("                          unsigned int stride, \n");
+  source.append("                          int batch_num) { \n");
+
+  source.append("    unsigned int glb_id = get_global_id(0); \n");
+  source.append("    unsigned int glb_sz = get_global_size(0); \n");
+
+  source.append("    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+  source.append("        for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+  source.append("            unsigned int v = get_reorder_num_2(i, bit_size); \n");
+
+  source.append("            if (i < v) {\n");
+  if (is_row_major)
+  {
+    source.append("                "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
+    source.append("                input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
+    source.append("                input[batch_id * stride + v] = tmp; \n"); //index
+  }
+  else
+  {
+    source.append("                "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
+    source.append("                input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
+    source.append("                input[v * stride + batch_id] = tmp; \n"); //index
+  }
+  source.append("            } \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
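Both get_reorder_num helpers emitted above implement a textbook 32-bit bit reversal (swap adjacent bits, then bit pairs, nibbles, bytes, and half-words) and keep only the low bit_size bits; fft_reorder then swaps element i with its bit-reversed partner v only when i < v, so each pair is exchanged exactly once. A minimal host-side sketch of the same index computation, useful for checking the kernel logic on the CPU, could look like the following (illustrative only, not part of this patch):

    #include <cstdio>

    // Reference implementation of the bit-reversed index used by the FFT kernels above.
    static unsigned int reorder_ref(unsigned int v, unsigned int bit_size)
    {
      v = ((v >> 1)  & 0x55555555u) | ((v & 0x55555555u) << 1);   // swap adjacent bits
      v = ((v >> 2)  & 0x33333333u) | ((v & 0x33333333u) << 2);   // swap bit pairs
      v = ((v >> 4)  & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4);   // swap nibbles
      v = ((v >> 8)  & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8);   // swap bytes
      v = (v >> 16) | (v << 16);                                  // swap half-words
      return v >> (32 - bit_size);                                // keep the reversed low bits
    }

    int main()
    {
      // For an 8-point FFT (bit_size = 3): 0 1 2 3 4 5 6 7 maps to 0 4 2 6 1 5 3 7.
      for (unsigned int i = 0; i < 8; ++i)
        std::printf("%u -> %u\n", i, reorder_ref(i, 3));
      return 0;
    }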
+
+template<typename StringT>
+void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void lu_factorize( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * matrix, \n");
+  source.append("          unsigned int matrix_rows, \n");
+  source.append("          unsigned int matrix_cols, \n");
+  source.append("          unsigned int matrix_internal_rows, \n");
+  source.append("          unsigned int matrix_internal_cols) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" temp; \n");
+
+  if (is_row_major)
+  {
+    source.append("  unsigned rowi; \n");
+    source.append("  unsigned rowk; \n");
+    source.append("  for (unsigned int i=1; i<matrix_rows; ++i) \n");
+    source.append("  { \n");
+    source.append("    rowi = i * matrix_internal_cols; \n");
+    source.append("    for (unsigned int k=0; k<i; ++k) \n");
+    source.append("    { \n");
+    source.append("      rowk = k * matrix_internal_cols; \n");
+    source.append("      if (get_global_id(0) == 0) \n");
+    source.append("        matrix[rowi + k] /= matrix[rowk + k]; \n");
+
+    source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+    source.append("      temp = matrix[rowi + k]; \n");
+
+    //parallel subtraction:
+    source.append("      for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
+    source.append("        matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
+  }
+  else
+  {
+    source.append("      for (unsigned int i=1; i<matrix_rows; ++i) \n");
+    source.append("      { \n");
+    source.append("        for (unsigned int k=0; k<i; ++k) \n");
+    source.append("        { \n");
+
+    source.append("          if (get_global_id(0) == 0) \n");
+    source.append("            matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
+
+    source.append("          barrier(CLK_GLOBAL_MEM_FENCE); \n");
+    source.append("          temp = matrix[i + k*matrix_internal_rows]; \n");
+
+    //parallel subtraction:
+    source.append("          for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
+    source.append("            matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
+  }
+  source.append("   }");
+  source.append("  }");
+  source.append("}");
+}
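For reference, the row-major instantiation assembled by generate_lu() expands to the OpenCL C below when numeric_string is "float" (reconstructed from the appends; the generator emits it as a single string). It is an in-place LU factorization without pivoting: work-item 0 computes the multiplier, and all work-items then update the remainder of row i in parallel. Since barrier() only synchronizes within a work-group, the kernel is presumably meant to be launched with a single work-group.

    __kernel void lu_factorize(
              __global float * matrix,
              unsigned int matrix_rows,
              unsigned int matrix_cols,
              unsigned int matrix_internal_rows,
              unsigned int matrix_internal_cols)
    {
      float temp;
      unsigned rowi;
      unsigned rowk;
      for (unsigned int i=1; i<matrix_rows; ++i)
      {
        rowi = i * matrix_internal_cols;
        for (unsigned int k=0; k<i; ++k)
        {
          rowk = k * matrix_internal_cols;
          if (get_global_id(0) == 0)
            matrix[rowi + k] /= matrix[rowk + k];          /* multiplier L(i,k) */
          barrier(CLK_GLOBAL_MEM_FENCE);
          temp = matrix[rowi + k];
          /* parallel update of the remaining entries of row i: */
          for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))
            matrix[rowi + j] -= temp * matrix[rowk + j];
        }
      }
    }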
+
+
+template<typename StringT>
+void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
+{
+  source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+
+  if (alpha_on_cpu) {
+    source.append("  "); source.append(numeric_string); source.append(" val, \n");
+  } else {
+    source.append("  __global const "); source.append(numeric_string); source.append(" *val, \n");
+  }
+  source.append("  unsigned int options2, \n");
+
+  source.append("  __global const "); source.append(numeric_string); source.append(" * vec1, \n");
+  source.append("  unsigned int start1, \n");
+  source.append("  unsigned int inc1, \n");
+  source.append("  unsigned int size1, \n");
+
+  source.append("  __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+  source.append("  unsigned int start2, \n");
+  source.append("  unsigned int inc2, \n");
+  source.append("  unsigned int size2) \n");
+  source.append("{ \n");
+
+  if (alpha_on_cpu) {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = val; \n");
+  } else {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = val[0]; \n");
+  }
+  source.append("  if (options2 & (1 << 0)) \n");
+  source.append("    alpha = -alpha; \n");
+
+  source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+  source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+  source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];");
+  source.append("    tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
+  source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+  if (is_row_major)
+    source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
+  else
+    source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
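The kernel reads its scaling behaviour from the options2 bit field: bit 0 negates alpha and bit 1 selects division by alpha instead of multiplication. A hypothetical helper showing how such an options word could be packed on the host (the actual packing is done elsewhere in ViennaCL, not in this file) might look like:

    // Hypothetical illustration of the options2 encoding consumed by
    // scaled_rank1_update_{cpu,gpu} above; not part of this patch.
    static unsigned int pack_rank1_options(bool flip_sign, bool reciprocal)
    {
      unsigned int options2 = 0;
      if (flip_sign)  options2 |= (1u << 0);   // kernel then uses -alpha
      if (reciprocal) options2 |= (1u << 1);   // kernel then divides by alpha
      return options2;
    }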
+
+template <typename StringType>
+void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void trans_vec_mul( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
+  source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
+  source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
+  source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
+  source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
+  source.append("{ \n");
+  if (is_row_major)
+  {
+    source.append("  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
+    source.append("  { \n");
+    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+    source.append("    for (unsigned int col = 0; col < A_row_size; ++col) \n");
+    source.append("      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
+    source.append("    result[row * result_inc + result_start] = dot_prod; \n");
+  }
+  else
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+    source.append("  unsigned int lid = get_local_id(0); \n");
+
+    source.append("  for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
+    source.append("  { \n");
+    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+    source.append("    for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
+    source.append("      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
+    source.append("    work[lid] = dot_prod; \n");
+
+    source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("      if(lid < stride) \n");
+    source.append("        work[lid] += work[lid+stride]; \n");
+    source.append("    } \n");
+
+    source.append("    if(lid == 0) \n");
+    source.append("      result[row * result_inc + result_start] = work[0]; \n");
+  }
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void triangular_substitute_inplace( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("          unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("          unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("          unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * v, \n");
+  source.append("          unsigned int v_start, \n");
+  source.append("          unsigned int v_inc, \n");
+  source.append("          unsigned int v_size, \n");
+  source.append("          unsigned int options) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" temp; \n");
+  source.append("  unsigned int unit_diagonal_flag  = (options & (1 << 0)); \n");
+  source.append("  unsigned int transposed_access_A = (options & (1 << 1)); \n");
+  source.append("  unsigned int is_lower_solve      = (options & (1 << 2)); \n");
+  source.append("  unsigned int row; \n");
+  source.append("  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  \n");   //Note: A required to be square
+  source.append("  { \n");
+  source.append("    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    if (!unit_diagonal_flag) \n");
+  source.append("    { \n");
+  source.append("      if (get_global_id(0) == 0) \n");
+  if (is_row_major)
+    source.append("        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+  else
+    source.append("        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+  source.append("   } \n");
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  source.append("    temp = v[row * v_inc + v_start]; \n");
+
+  source.append("    for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
+  source.append("             elim < (is_lower_solve ? A_size1 : row); \n");
+  source.append("             elim += get_global_size(0)) \n");
+  if (is_row_major)
+  {
+    source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
+    source.append("                                                                : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))]; \n");
+  }
+  else
+  {
+    source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
+    source.append("                                                                : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)]; \n");
+  }
+  source.append("  } \n");
+  source.append("} \n");
+}
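Here the options argument is also a bit field: bit 0 marks a unit diagonal (skip the division), bit 1 requests transposed access to A, and bit 2 selects a lower-triangular solve (otherwise upper). The kernel serializes over the rows of the square matrix A; work-item 0 divides by the diagonal entry and all work-items eliminate the remaining entries of v in parallel. A hypothetical host-side packing of these flags (illustrative only) could be:

    // Hypothetical illustration of the options encoding consumed by
    // triangular_substitute_inplace above; not part of this patch.
    static unsigned int pack_substitute_options(bool unit_diagonal, bool transposed_A, bool lower_solve)
    {
      return (unit_diagonal ? (1u << 0) : 0u)
           | (transposed_A  ? (1u << 1) : 0u)
           | (lower_solve   ? (1u << 2) : 0u);
    }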
+
+template <typename StringT>
+void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void trans_kernel(\n");
+  source.append("           __global const ");source.append(numeric_string);source.append(" * A, \n");
+  source.append("           unsigned int A_start1,          unsigned int A_start2, \n");
+  source.append("           unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+  source.append("           unsigned int A_size1,           unsigned int A_size2, \n");
+  source.append("           unsigned int A_stride1,         unsigned int A_stride2, \n");
+  source.append("           __global ");source.append(numeric_string);source.append(" * B, \n");
+  source.append("           unsigned int B_start1,          unsigned int B_start2, \n");
+  source.append("           unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
+  source.append("           unsigned int B_stride1,         unsigned int B_stride2) \n");
+  source.append("{ \n");
+  source.append("  for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
+  source.append("  {  \n");
+  source.append("    for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
+  source.append("    {  \n");
+  if(is_row_major)
+    source.append("      B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];  \n");
+  else
+    source.append("      B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];  \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("}  \n");
+}
+
+template <typename StringType>
+void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+  source.append("__kernel void vec_mul( \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
+  source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
+  source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
+  source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
+  source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
+  source.append("{ \n");
+  if (is_row_major)
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+    source.append("  unsigned int lid = get_local_id(0); \n");
+
+    source.append("  for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
+    source.append("  { \n");
+    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+    source.append("    for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
+    source.append("      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
+    source.append("    work[lid] = dot_prod; \n");
+
+    source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+    source.append("      if(lid < stride) \n");
+    source.append("        work[lid] += work[lid+stride]; \n");
+    source.append("    } \n");
+
+    source.append("    if(lid == 0) \n");
+    source.append("      result[row * result_inc + result_start] = work[0]; \n");
+
+  }
+  else
+  {
+    source.append("    for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
+    source.append("    { \n");
+    source.append("      "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+    source.append("      for (unsigned int col = 0; col < A_col_size; ++col) \n");
+    source.append("        dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
+    source.append("      result[row * result_inc + result_start] = dot_prod; \n");
+  }
+  source.append("  } \n");
+  source.append("} \n");
+}
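In the row-major branch each work-group walks over rows, every work-item accumulates a strided partial dot product into the local buffer work, and the halving loop performs a tree reduction; this assumes a power-of-two local work size and a work buffer of at least get_local_size(0) elements. A small host-side emulation of that reduction step (illustrative only):

    #include <vector>

    // Emulates the in-kernel tree reduction over one work-group's partial sums;
    // like the kernel, it assumes the number of partial sums is a power of two.
    static float reduce_tree(std::vector<float> work)
    {
      for (std::size_t stride = work.size() / 2; stride > 0; stride >>= 1)
        for (std::size_t lid = 0; lid < stride; ++lid)
          work[lid] += work[lid + stride];
      return work[0];
    }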
+
+namespace detail
+{
+  inline std::string type_to_string(viennacl::row_major)    { return "row"; }
+  inline std::string type_to_string(viennacl::column_major) { return "col"; }
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
+template <typename NumericT, typename F>
+struct matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+    std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+    bool is_row_major = viennacl::is_row_major<F>::value;
+
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parametrized kernels:
+      generate_ambm(source, numeric_string, is_row_major);
+
+      // kernels with mostly predetermined skeleton:
+      generate_assign_cpu(source, numeric_string, is_row_major);
+      generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
+      generate_element_op(source, numeric_string, is_row_major);
+      generate_trans_vec_mul(source, numeric_string, is_row_major);
+      generate_vec_mul(source, numeric_string, is_row_major);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
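A typical call site first initializes the program for the current OpenCL context and then fetches one of the generated kernels by name, mirroring the pattern used later in this patch for the compressed_matrix kernels. A sketch (the matrix object A and the float/row_major choice are assumptions for illustration):

    // Returns the "vec_mul" kernel generated by the matrix<> kernel class above.
    template<typename MatrixT>
    viennacl::ocl::kernel & get_vec_mul_kernel(MatrixT const & A)
    {
      typedef viennacl::linalg::opencl::kernels::matrix<float, viennacl::row_major> KernelClass;
      viennacl::ocl::context & ctx =
        const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
      KernelClass::init(ctx);   // compiles the program once per context
      return ctx.get_kernel(KernelClass::program_name(), "vec_mul");
    }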
+
+/** @brief Main kernel class for generating OpenCL kernels for dense matrix-matrix products of viennacl::matrix<> objects (all combinations of transposed arguments). */
+template<typename NumericT>
+class matrix_prod
+{
+public:
+  static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
+  {
+    static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
+    cl_context h = ctx.handle().get();
+    std::pair<bool, cl_context> key(is_row_major, h);
+    if (handlers_map.find(key) == handlers_map.end())
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+
+      namespace ds = viennacl::device_specific;
+      viennacl::ocl::device const & device = ctx.current_device();
+      std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
+      handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
+      ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
+
+      ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
+      ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
+      ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
+      ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
+
+      tools::shared_ptr<viennacl::matrix_base<NumericT> > pC;
+      if (is_row_major)
+        pC.reset(new viennacl::matrix<NumericT, viennacl::row_major>());
+      else
+        pC.reset(new viennacl::matrix<NumericT, viennacl::column_major>());
+
+      //Dummy types. The values don't matter for the kernel generation.
+      viennacl::matrix_base<NumericT>& C = *pC;
+      viennacl::matrix<NumericT, viennacl::column_major> A;
+      viennacl::matrix<NumericT, viennacl::column_major> B;
+      NumericT alpha = 1;
+      NumericT beta = 0;
+
+      handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
+      handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
+      handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
+      handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
+
+    }
+  return viennacl::device_specific::at(handlers_map, key);
+  }
+};
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
+template<typename NumericT, typename LayoutT>
+struct matrix_legacy
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_legacy_" + detail::type_to_string(LayoutT());
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+      bool is_row_major = viennacl::is_row_major<LayoutT>::value;
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // kernels with mostly predetermined skeleton:
+      generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
+      generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
+
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_fft(source, numeric_string, is_row_major);
+        generate_lu(source, numeric_string, is_row_major);
+        generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
+        generate_trans_kernel(source, numeric_string, is_row_major);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+
+
+
+template<typename StringT>
+void generate_matrix_convert_row(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ source.append(" __kernel void convert_row_" + dest_type + "_" + src_type + "( \n");
+ source.append("  __global " + dest_type + " * dest, \n");
+ source.append("  unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
+ source.append("  unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
+ source.append("  __global const " + src_type + " * src, \n");
+ source.append("  unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
+ source.append("  unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
+ source.append("  { \n");
+ source.append("   for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
+ source.append("     for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
+ source.append("       dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
+ source.append("  } \n");
+}
+
+template<typename StringT>
+void generate_matrix_convert_col(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+  source.append(" __kernel void convert_col_" + dest_type + "_" + src_type + "( \n");
+  source.append("  __global " + dest_type + " * dest, \n");
+  source.append("  unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
+  source.append("  unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
+  source.append("  __global const " + src_type + " * src, \n");
+  source.append("  unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
+  source.append("  unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
+  source.append("  { \n");
+  source.append("   for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
+  source.append("     for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
+  source.append("       dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
+  source.append("  } \n");
+}
+
+template<typename StringT>
+void generate_matrix_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+  generate_matrix_convert_row(source, dest_type, src_type);
+  generate_matrix_convert_col(source, dest_type, src_type);
+}
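The two generators above emit kernels named convert_row_<dest>_<src> and convert_col_<dest>_<src>, one per type pair registered in matrix_convert::init() below. An illustrative helper composing such a name:

    #include <string>

    // e.g. convert_kernel_name(false, "float", "int") == "convert_col_float_int"
    std::string convert_kernel_name(bool row_major, std::string const & dest_type, std::string const & src_type)
    {
      return std::string("convert_") + (row_major ? "row_" : "col_") + dest_type + "_" + src_type;
    }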
+
+/** @brief Main kernel class for matrix conversion routines (e.g. convert matrix<int> to matrix<float>). */
+struct matrix_convert
+{
+
+public:
+  static std::string program_name()
+  {
+    return "matrix_convert";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(4096);
+
+      // int
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // unsigned int
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // long
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // unsigned long
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      // float
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+      generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+      if (ctx.current_device().double_support())
+      {
+        viennacl::ocl::append_double_precision_pragma<double>(ctx, source);
+
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(),           viennacl::ocl::type_to_string<double>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(),  viennacl::ocl::type_to_string<double>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(),          viennacl::ocl::type_to_string<double>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(),         viennacl::ocl::type_to_string<double>::apply());
+
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<int>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<long>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<float>::apply());
+        generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<double>::apply());
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+
+};
+
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
new file mode 100644
index 0000000..d3b684f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
@@ -0,0 +1,138 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_element.hpp
+ *  @brief OpenCL kernel file for element-wise matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// Generates code for element-wise unary matrix operations of the form A OP funcname(B), applied entry-wise (e.g. A = sin(B)); A and B share the same storage layout (row- or column-major).
+template <typename StringType>
+void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string,
+                                       std::string const & funcname, std::string const & op, std::string const & op_name, bool is_row_major)
+{
+  source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("          unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("          unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("          unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+
+  source.append("          __global const "); source.append(numeric_string); source.append(" * B, \n");
+  source.append("          unsigned int B_start1, unsigned int B_start2, \n");
+  source.append("          unsigned int B_inc1,   unsigned int B_inc2, \n");
+  source.append("          unsigned int B_internal_size1,  unsigned int B_internal_size2) { \n");
+
+  if (is_row_major)
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+    source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+    source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+    source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] \n");
+    source.append("        "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); \n");
+  }
+  else
+  {
+    source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0); \n");
+    source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0); \n");
+
+    source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0)) \n");
+    source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0)) \n");
+    source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] \n");
+    source.append("        "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); \n");
+  }
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string, std::string const & funcname, bool is_row_major)
+{
+  generate_matrix_unary_element_ops(source, numeric_string, funcname, "=", "assign", is_row_major);
+  //generate_matrix_unary_element_ops(source, numeric_string, funcname, "+=", "plus", is_row_major);
+  //generate_matrix_unary_element_ops(source, numeric_string, funcname, "-=", "minus", is_row_major);
+}
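Only the "=" variant is emitted (the "+=" and "-=" variants are commented out), so each call produces a kernel named <funcname>_assign, e.g. sin_assign, which overwrites the active sub-matrix of A with funcname applied entry-wise to B. A host-side equivalent, ignoring the start/increment offsets for brevity (illustrative sketch only):

    #include <cmath>

    // Host reference for what a generated kernel such as "sin_assign" computes
    // on a row-major sub-matrix with leading dimensions A_ld and B_ld.
    void sin_assign_ref(float * A, const float * B,
                        unsigned int rows, unsigned int cols,
                        unsigned int A_ld, unsigned int B_ld)
    {
      for (unsigned int r = 0; r < rows; ++r)
        for (unsigned int c = 0; c < cols; ++c)
          A[r * A_ld + c] = std::sin(B[r * B_ld + c]);
    }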
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for element-wise operations such as element_sin() on/with dense matrix objects of type viennacl::matrix<>. */
+template <typename NumericT, typename F>
+struct matrix_element
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_element_" + detail::type_to_string(F());
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+    std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      std::string source;
+      source.reserve(8192);
+      bool is_row_major = viennacl::is_row_major<F>::value;
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // unary operations
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_matrix_unary_element_ops(source, numeric_string, "acos",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "asin",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "atan",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "ceil",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "cos",   is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "cosh",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "exp",   is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "fabs",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "floor", is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "log",   is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "log10", is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "sin",   is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "sinh",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "sqrt",  is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "tan",   is_row_major);
+        generate_matrix_unary_element_ops(source, numeric_string, "tanh",  is_row_major);
+      }
+      else
+      {
+        generate_matrix_unary_element_ops(source, numeric_string, "abs", is_row_major);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
new file mode 100644
index 0000000..f25a7a7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
@@ -0,0 +1,180 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_solve.hpp
+ *  @brief OpenCL kernel file for dense matrix solves with multiple right hand side (BLAS level 3) */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template<typename StringT>
+void generate_matrix_solve_blas3(StringT & source, std::string const & numeric_string,
+                                 bool row_major_A, bool row_major_B,
+                                 bool upper_solve, bool unit_diagonal)
+{
+  //start OpenCL code:
+  source.append("__kernel void ");
+  if (unit_diagonal)
+    source.append("unit_");
+  if (upper_solve)
+    source.append("upper_");
+  else
+    source.append("lower_");
+  source.append("solve");
+
+  source.append("( \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * A, \n");
+  source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+  source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+  source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+  source.append("  unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
+  source.append("  unsigned int B_start1, unsigned int B_start2, \n");
+  source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
+  source.append("  unsigned int B_size1,  unsigned int B_size2, \n");
+  source.append("  unsigned int B_internal_size1, unsigned int B_internal_size2) { \n");
+  source.append("  "); source.append(numeric_string); source.append(" temp;  \n");
+  if (upper_solve)
+  {
+    //Note: A is square, thus A_rows == A_cols and no dispatch for transposedness needed
+    source.append("  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)  \n");
+    source.append("  {  \n");
+    source.append("    unsigned int row = A_size1 - 1 - row_cnt; \n");
+  }
+  else //lower triangular solve
+  {
+    source.append("  for (unsigned int row = 0; row < A_size1; ++row) \n");
+    source.append("  { \n");
+  }
+
+  if (!unit_diagonal)
+  {
+    source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+    source.append("    if (get_local_id(0) == 0)  \n");
+    //Note: A is square, thus A_internal_rows == A_internal_cols and no dispatch for transposedness needed
+    if (row_major_B)
+      source.append("      B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] /= ");
+    else
+      source.append("      B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] /= ");
+
+    if (row_major_A)
+      source.append("A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+    else
+      source.append("A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; \n");
+  }
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  if (row_major_B)
+    source.append("    temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)]; \n");
+  else
+    source.append("    temp = B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1]; \n");
+
+  source.append("    //eliminate column of op(A) with index 'row' in parallel: \n");
+  if (upper_solve)
+    source.append("    for  (unsigned int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n");
+  else
+    source.append("    for  (unsigned int elim = row + get_local_id(0) + 1; elim < A_size1; elim += get_local_size(0)) \n");
+
+  if (row_major_B)
+    source.append("      B[(elim * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] -= temp * ");
+  else
+    source.append("      B[(elim * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] -= temp * ");
+
+  if (row_major_A)
+    source.append("A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+  else
+    source.append("A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+
+  source.append("   } \n");
+  source.append("} \n");
+}
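The four flag combinations used in matrix_solve::init() below yield kernels named lower_solve, unit_lower_solve, upper_solve and unit_upper_solve (one set per <matrix layout, rhs layout> program); each work-group processes one right-hand-side column of B, serializing over the rows of the square matrix A. An illustrative name composer matching the generator:

    #include <string>

    // e.g. solve_kernel_name(true, false) == "upper_solve"
    std::string solve_kernel_name(bool upper_solve, bool unit_diagonal)
    {
      std::string name;
      if (unit_diagonal) name += "unit_";
      name += upper_solve ? "upper_" : "lower_";
      name += "solve";
      return name;
    }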
+
+
+// main kernel class
+/** @brief Main kernel class for the generation of matrix solve kernels.
+  *
+  * @tparam LayoutT1  Row/Column majority tag for the system matrix
+  * @tparam LayoutT2  Row/Column majority tag for the right-hand-side matrix
+  */
+template<typename NumericT, typename LayoutT1, typename LayoutT2>
+struct matrix_solve
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_solve_" + detail::type_to_string(LayoutT1()) + detail::type_to_string(LayoutT2());
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+      bool matrix_row_major = viennacl::is_row_major<LayoutT1>::value;
+      bool rhs_row_major    = viennacl::is_row_major<LayoutT2>::value;
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // only generate for floating points (forces error for integers)
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                    false, false);
+        generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                    false, true);
+        generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                    true, false);
+        generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                    true, true);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
new file mode 100644
index 0000000..46cb419
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
@@ -0,0 +1,99 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/nmf.hpp
+ *  @brief OpenCL kernel file for nonnegative matrix factorization */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template<typename StringT>
+void generate_nmf_el_wise_mul_div(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void el_wise_mul_div( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * matrix1, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * matrix2, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * matrix3, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" val = matrix1[i] * matrix2[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" divisor = matrix3[i]; \n");
+  source.append("    matrix1[i] = (divisor > ("); source.append(numeric_string); source.append(")0.00001) ? (val / divisor) : ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
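The kernel implements the element-wise multiply-then-divide step of the NMF multiplicative update, matrix1[i] <- matrix1[i] * matrix2[i] / matrix3[i], clamping the result to zero whenever the divisor falls below 1e-5 to avoid division blow-ups. A host-side equivalent (illustrative sketch):

    // Host reference for el_wise_mul_div above.
    void el_wise_mul_div_ref(float * m1, const float * m2, const float * m3, unsigned int size)
    {
      for (unsigned int i = 0; i < size; ++i)
      {
        const float val     = m1[i] * m2[i];
        const float divisor = m3[i];
        m1[i] = (divisor > 1e-5f) ? (val / divisor) : 0.0f;
      }
    }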
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for nonnegative matrix factorization of dense matrices. */
+template<typename NumericT>
+struct nmf
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_nmf";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // only generate for floating points (forces error for integers)
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_nmf_el_wise_mul_div(source, numeric_string);
+      }
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+


[03/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..a8d1557
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
@@ -0,0 +1,1244 @@
+#ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/ell_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/hyb_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/vector_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(compressed_matrix<NumericT, AlignmentV> const & A,
+                vector_base<NumericT> & x,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+    viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+    viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "row_info_extractor");
+
+    viennacl::ocl::enqueue(row_info_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                                           viennacl::traits::opencl_handle(x),
+                                           cl_uint(A.size1()),
+                                           cl_uint(info_selector)
+                                          )
+                          );
+  }
+}
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression y = prod(A, x);
+* the operation carried out is y = alpha * A * x + beta * y.
+*
+* @param A      The matrix
+* @param x      The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param y      The result vector
+* @param beta   Scaling factor applied to y before the product is added
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & A,
+               const viennacl::vector_base<NumericT> & x,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & y,
+               NumericT beta)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+  bool use_nvidia_specific = AlignmentV == 1 && ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0);
+  bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);
+
+
+  std::stringstream ss;
+  ss << "vec_mul";
+  unsigned int alignment = AlignmentV; //prevent unreachable code warnings below
+  if (use_nvidia_specific)
+    ss << "_nvidia";
+  else
+  {
+    if (alignment == 4)
+      ss << "4";
+    if (alignment == 8)
+      ss << "8";
+  }
+
+  if (with_alpha_beta)
+    ss << "_alpha_beta";
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), ss.str());
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  if (alignment == 4 || alignment == 8)
+  {
+    if (with_alpha_beta)
+      viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                               x, layout_x,
+                               alpha,
+                               y, layout_y,
+                               beta
+                              ));
+    else
+      viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                               x, layout_x,
+                               y, layout_y
+                              ));
+  }
+  else
+  {
+    if (ctx.current_device().max_work_group_size() >= 256)
+      k.local_work_size(0, 256);
+
+    if (use_nvidia_specific)
+    {
+      k.global_work_size(0, 512 * k.local_work_size(0));
+
+      if (with_alpha_beta)
+        viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                                 x, layout_x,
+                                 alpha,
+                                 y, layout_y,
+                                 beta
+                                ));
+      else
+        viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                                 x, layout_x,
+                                 y, layout_y
+                                ));
+    }
+    else // use CSR adaptive:
+    {
+      k.global_work_size(0, A.blocks1() * k.local_work_size(0));
+
+      if (with_alpha_beta)
+        viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                                 x, layout_x,
+                                 alpha,
+                                 y, layout_y,
+                                 beta
+                                ));
+      else
+        viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                                 x, layout_x,
+                                 y, layout_y
+                                ));
+    }
+  }
+}
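+
+// Usage sketch (illustrative only; assumes viennacl/compressed_matrix.hpp,
+// viennacl/vector.hpp and viennacl/linalg/prod.hpp are included and A has been
+// filled, e.g. via viennacl::copy()):
+//
+//   viennacl::compressed_matrix<float> A(n, n);
+//   viennacl::vector<float> x(n), y(n);
+//   y = viennacl::linalg::prod(A, x);   // carried out by the prod_impl() overload above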
+
+
+/** @brief Carries out sparse-matrix times dense-matrix multiplication, where the first factor is a compressed_matrix
+*
+* Implementation of the convenience expression y = prod(sp_A, d_A);
+*
+* @param sp_A     The sparse matrix
+* @param d_A      The dense matrix
+* @param y        The result matrix
+*/
+template< typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_A,
+               const viennacl::matrix_base<NumericT> & d_A,
+                     viennacl::matrix_base<NumericT> & y) {
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+  viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(d_A),
+                           cl_uint(viennacl::traits::start1(d_A)),          cl_uint(viennacl::traits::start2(d_A)),
+                           cl_uint(viennacl::traits::stride1(d_A)),         cl_uint(viennacl::traits::stride2(d_A)),
+                           cl_uint(viennacl::traits::size1(d_A)),           cl_uint(viennacl::traits::size2(d_A)),
+                           cl_uint(viennacl::traits::internal_size1(d_A)),  cl_uint(viennacl::traits::internal_size2(d_A)),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)) ));
+}
+
+/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, where the
+*          first factor is a compressed_matrix and the second factor is a transposed dense matrix
+*
+* Implementation of the convenience expression y = prod(sp_A, trans(B));
+*
+* @param sp_A             The sparse matrix
+* @param d_A              The transposed dense matrix
+* @param y                The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & sp_A,
+               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                            const viennacl::matrix_base<NumericT>,
+                                            viennacl::op_trans > const & d_A,
+               viennacl::matrix_base<NumericT> & y) {
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+  viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(d_A.lhs()),
+                           cl_uint(viennacl::traits::start1(d_A.lhs())),          cl_uint(viennacl::traits::start2(d_A.lhs())),
+                           cl_uint(viennacl::traits::stride1(d_A.lhs())),         cl_uint(viennacl::traits::stride2(d_A.lhs())),
+                           cl_uint(viennacl::traits::size1(d_A.lhs())),           cl_uint(viennacl::traits::size2(d_A.lhs())),
+                           cl_uint(viennacl::traits::internal_size1(d_A.lhs())),  cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)) ) );
+}
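+
+// Usage sketch (illustrative only; assumes the operands are filled and
+// viennacl/linalg/prod.hpp is included):
+//
+//   viennacl::compressed_matrix<float> S(m, k);
+//   viennacl::matrix<float> D(k, n), D2(n, k), Y(m, n);
+//   Y = viennacl::linalg::prod(S, D);                    // sparse * dense overload above
+//   Y = viennacl::linalg::prod(S, viennacl::trans(D2));  // sparse * transposed-dense overload
+//
+// The kernel name is picked by detail::sparse_dense_matmult_kernel_name() from the
+// row-/column-major layouts of the dense operand and of the result.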
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A     Left factor
+* @param B     Right factor
+* @param C     Result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+
+  /*
+   * Stage 1: Analyze sparsity pattern in order to properly allocate temporary arrays
+   *
+   * - Upper bound for the row lengths in C
+   */
+  viennacl::vector<unsigned int> upper_bound_nonzeros_per_row_A(256, ctx); // upper bound for the nonzeros per row encountered for each work group
+
+  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage1");
+  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+                            viennacl::traits::opencl_handle(upper_bound_nonzeros_per_row_A)
+                        )  );
+
+  upper_bound_nonzeros_per_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int * upper_bound_nonzeros_per_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(upper_bound_nonzeros_per_row_A.handle());
+
+  unsigned int max_nnz_per_row_A = 0;
+  for (std::size_t i=0; i<upper_bound_nonzeros_per_row_A.size(); ++i)
+    max_nnz_per_row_A = std::max(max_nnz_per_row_A, upper_bound_nonzeros_per_row_A_ptr[i]);
+
+  if (max_nnz_per_row_A > 32)
+  {
+    // determine augmented size:
+    unsigned int max_entries_in_G = 32;
+    if (max_nnz_per_row_A <= 256)
+      max_entries_in_G = 16;
+    if (max_nnz_per_row_A <= 64)
+      max_entries_in_G = 8;
+
+    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+    viennacl::ocl::kernel & k_decompose_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_decompose_1");
+    viennacl::ocl::enqueue(k_decompose_1(A.handle1().opencl_handle(), cl_uint(A.size1()),
+                                         cl_uint(max_entries_in_G),
+                                         viennacl::traits::opencl_handle(exclusive_scan_helper)
+                          )             );
+
+    // exclusive scan of helper array to find new size:
+    viennacl::linalg::exclusive_scan(exclusive_scan_helper);
+    unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+    // split A = A2 * G1
+    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(),        A.nnz(), viennacl::traits::context(A));
+
+    // fill A2:
+    viennacl::ocl::kernel & k_fill_A2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_A2");
+    viennacl::ocl::enqueue(k_fill_A2(A2.handle1().opencl_handle(), A2.handle2().opencl_handle(), A2.handle().opencl_handle(), cl_uint(A2.size1()),
+                                     viennacl::traits::opencl_handle(exclusive_scan_helper)
+                          )         );
+
+    // fill G1:
+    viennacl::ocl::kernel & k_fill_G1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_G1");
+    viennacl::ocl::enqueue(k_fill_G1(G1.handle1().opencl_handle(), G1.handle2().opencl_handle(), G1.handle().opencl_handle(), cl_uint(G1.size1()),
+                                     A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), cl_uint(A.nnz()),
+                                     cl_uint(max_entries_in_G),
+                                     viennacl::traits::opencl_handle(exclusive_scan_helper)
+                          )         );
+
+    // compute tmp = G1 * B;
+    // C = A2 * tmp;
+    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+    prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+    prod_impl(A2, tmp, C); // this may split A2 again
+    return;
+  }
+
+
+  /*
+   * Stage 2: Determine sparsity pattern of C
+   */
+  C.resize(A.size1(), B.size2(), false);
+
+  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage2");
+  k2.local_work_size(0, 32); // run with one warp/wavefront
+  k2.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
+  viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+                            B.handle1().opencl_handle(), B.handle2().opencl_handle(), cl_uint(B.size2()),
+                            C.handle1().opencl_handle()
+                        )  );
+
+  // exclusive scan on host to obtain row start indices:
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+  unsigned int current_offset = 0;
+  for (std::size_t i=0; i<C.size1(); ++i)
+  {
+    unsigned int tmp = row_buffer[i];
+    row_buffer.set(i, current_offset);
+    current_offset += tmp;
+  }
+  row_buffer.set(C.size1(), current_offset);
+  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+
+
+  /*
+   * Stage 3: Compute entries in C
+   */
+
+  C.reserve(current_offset, false);
+
+  viennacl::ocl::kernel & k3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage3");
+  k3.local_work_size(0, 32); // run with one warp/wavefront
+  k3.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
+  viennacl::ocl::enqueue(k3(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+                            B.handle1().opencl_handle(), B.handle2().opencl_handle(), B.handle().opencl_handle(), cl_uint(B.size2()),
+                            C.handle1().opencl_handle(), C.handle2().opencl_handle(), C.handle().opencl_handle()
+                        )  );
+
+}
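+
+// Usage sketch (illustrative only):
+//
+//   viennacl::compressed_matrix<float> A, B, C;   // A is m x k, B is k x n
+//   C = viennacl::linalg::prod(A, B);             // runs the three stages above
+//
+// The merge kernels work on one row of A per warp/wavefront, so a row of A can pull in
+// at most 32 rows of B at once; rows with more than 32 nonzeros are therefore first
+// decomposed as A = A2 * G1 (see above) and the product is evaluated as A2 * (G1 * B).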
+
+// triangular solvers
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param L    The matrix
+* @param x  The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int MAT_AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, MAT_AlignmentV> const & L,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::unit_lower_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "unit_lu_forward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(L.size1())
+                          )
+                        );
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param L    The matrix
+* @param x  The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::lower_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "lu_forward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(L.size1())
+                          )
+                        );
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param U    The matrix
+* @param x  The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::unit_upper_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "unit_lu_backward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(U.size1())
+                          )
+                        );
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param U    The matrix
+* @param x  The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::upper_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "lu_backward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(U.size1())
+                          )
+                        );
+}
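+
+// Usage sketch (illustrative only; x holds the right-hand side and is overwritten):
+//
+//   viennacl::compressed_matrix<float> L;   // square, lower triangular
+//   viennacl::vector<float> x;
+//   viennacl::linalg::inplace_solve(L, x, viennacl::linalg::unit_lower_tag());  // unit diagonal
+//   // ...or, for a stored (non-unit) diagonal:
+//   viennacl::linalg::inplace_solve(L, x, viennacl::linalg::lower_tag());
+//
+// upper_tag() and unit_upper_tag() select the backward-substitution kernels above in
+// the same way.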
+
+
+
+
+
+// transposed triangular solvers
+
+namespace detail
+{
+  //
+  // block solves
+  //
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & L,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
+                           vector_base<NumericT> & x,
+                           viennacl::linalg::unit_lower_tag)
+  {
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
+    viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+    viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "block_trans_unit_lu_forward");
+    block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+    viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
+                                              L.lhs().handle2().opencl_handle(),
+                                              L.lhs().handle().opencl_handle(),
+                                              block_indices.opencl_handle(),
+                                              x,
+                                              static_cast<cl_uint>(x.size())));
+  }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                             const compressed_matrix<NumericT, AlignmentV>,
+                                             op_trans> const & U,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & U_diagonal,
+                           vector_base<NumericT>       & x,
+                           viennacl::linalg::upper_tag)
+  {
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
+    viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+    viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "block_trans_lu_backward");
+    block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+    viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
+                                              U.lhs().handle2().opencl_handle(),
+                                              U.lhs().handle().opencl_handle(),
+                                              U_diagonal,
+                                              block_indices.opencl_handle(),
+                                              x,
+                                              static_cast<cl_uint>(x.size())));
+  }
+
+
+}
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy_L  The transposed matrix proxy
+* @param x      The vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy_L,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::unit_lower_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_unit_lu_forward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(proxy_L.lhs().size1())
+                          )
+                        );
+}
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy_L  The transposed matrix proxy
+* @param x      The vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy_L,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::lower_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+  viennacl::vector<NumericT> diagonal(x.size());
+  detail::row_info(proxy_L.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_lu_forward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(diagonal),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(proxy_L.lhs().size1())
+                          )
+                        );
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy_U  The transposed matrix proxy
+* @param x      The vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy_U,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::unit_upper_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_unit_lu_backward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(proxy_U.lhs().size1())
+                          )
+                        );
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy_U  The transposed matrix proxy
+* @param x      The vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy_U,
+                   vector_base<NumericT> & x,
+                   viennacl::linalg::upper_tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+  viennacl::vector<NumericT> diagonal(x.size());
+  detail::row_info(proxy_U.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_lu_backward");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(diagonal),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(proxy_U.lhs().size1())
+                          )
+                        );
+}
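+
+// Note on the four transposed overloads above: they solve systems of the form
+// trans(A) * x = b in place. The non-unit-diagonal variants first gather the diagonal of
+// A via detail::row_info(..., SPARSE_ROW_DIAGONAL) and hand it to the kernel, which
+// divides by it during the substitution; the unit-diagonal variants skip that extra pass.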
+
+
+//
+// Compressed Compressed matrix
+//
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression y = prod(A, x);
+* the operation carried out is y = alpha * A * x + beta * y.
+*
+* @param A      The matrix
+* @param x      The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param y      The result vector
+* @param beta   Scaling factor applied to y before the product is added
+*/
+template<typename NumericT>
+void prod_impl(viennacl::compressed_compressed_matrix<NumericT> const & A,
+               viennacl::vector_base<NumericT> const & x,
+               NumericT alpha,
+               viennacl::vector_base<NumericT>       & y,
+               NumericT beta)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::compressed_compressed_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_compressed_matrix<NumericT>::program_name(), "vec_mul");
+
+  if (beta < 0 || beta > 0) // multiply by beta
+    viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
+  else
+    y.clear();
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle3().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.nnz1()),
+                           x, layout_x,
+                           alpha,
+                           y, layout_y,
+                           beta
+                          ));
+}
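+
+// Overall semantics: y <- alpha * A * x + beta * y. The beta term is applied on the host
+// first (y is scaled by beta, or cleared when beta == 0) because a
+// compressed_compressed_matrix stores only its nonzero rows (handle3() holds their row
+// indices, nnz1() their count), so entries of y belonging to empty rows of A are never
+// touched by the kernel.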
+
+
+//
+// Coordinate matrix
+//
+
+namespace detail
+{
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(coordinate_matrix<NumericT, AlignmentV> const & A,
+                vector_base<NumericT> & x,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+    viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+    viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(), "row_info_extractor");
+    unsigned int thread_num = 128; //k.local_work_size(0);
+
+    row_info_kernel.local_work_size(0, thread_num);
+
+    row_info_kernel.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+    viennacl::ocl::enqueue(row_info_kernel(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                                           viennacl::traits::opencl_handle(x),
+                                           cl_uint(info_selector),
+                                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
+  }
+}
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression y = prod(A, x);
+* the operation carried out is y = alpha * A * x + beta * y.
+*
+* @param A      The matrix
+* @param x      The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param y      The result vector
+* @param beta   Scaling factor applied to y before the product is added
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+               viennacl::vector_base<NumericT> const & x,
+               NumericT alpha,
+               viennacl::vector_base<NumericT>       & y,
+               NumericT beta)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+  if (beta < 0 || beta > 0) // multiply by beta
+    viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
+  else
+    y.clear();
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  //std::cout << "prod(coordinate_matrix" << AlignmentV << ", vector) called with internal_nnz=" << A.internal_nnz() << std::endl;
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(), "vec_mul");
+  unsigned int thread_num = 128; //k.local_work_size(0);
+
+  k.local_work_size(0, thread_num);
+
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+  //k.global_work_size(0, thread_num);  //Only one work group
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           viennacl::traits::opencl_handle(x),
+                           layout_x,
+                           alpha,
+                           viennacl::traits::opencl_handle(y),
+                           layout_y,
+                           beta,
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
+
+}
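+
+// Usage sketch (illustrative only; one possible way to fill a COO matrix from host data):
+//
+//   std::vector< std::map<unsigned int, float> > cpu_A(m);   // row index -> (column, value)
+//   cpu_A[0][3] = 1.5f;
+//   viennacl::coordinate_matrix<float> A;
+//   viennacl::copy(cpu_A, A);            // packs the (row, column) index pairs into handle12()
+//   viennacl::vector<float> x(n), y(m);
+//   y = viennacl::linalg::prod(A, x);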
+
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+*
+* Implementation of the convenience expression y = prod(A, B); with A being sparse (COO) and B being dense
+*
+* @param A    The sparse matrix (COO format)
+* @param d_A  The dense matrix
+* @param y    The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+               viennacl::matrix_base<NumericT> const & d_A,
+               viennacl::matrix_base<NumericT>       & y)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+  y.clear();
+
+  unsigned int thread_num = 128; //k.local_work_size(0);
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           viennacl::traits::opencl_handle(d_A),
+                           cl_uint(viennacl::traits::start1(d_A)),          cl_uint(viennacl::traits::start2(d_A)),
+                           cl_uint(viennacl::traits::stride1(d_A)),         cl_uint(viennacl::traits::stride2(d_A)),
+                           cl_uint(viennacl::traits::size1(d_A)),           cl_uint(viennacl::traits::size2(d_A)),
+                           cl_uint(viennacl::traits::internal_size1(d_A)),  cl_uint(viennacl::traits::internal_size2(d_A)),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)),
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
+                           viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+}
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+*
+* Implementation of the convenience expression y = prod(A, trans(B)); with A being sparse (COO) and B being dense
+*
+* @param A    The sparse matrix (COO format)
+* @param d_A  The transposed dense matrix
+* @param y    The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                            const viennacl::matrix_base<NumericT>,
+                                            viennacl::op_trans > const & d_A,
+               viennacl::matrix_base<NumericT> & y)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+  y.clear();
+
+  unsigned int thread_num = 128; //k.local_work_size(0);
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           viennacl::traits::opencl_handle(d_A),
+                           cl_uint(viennacl::traits::start1(d_A.lhs())),          cl_uint(viennacl::traits::start2(d_A.lhs())),
+                           cl_uint(viennacl::traits::stride1(d_A.lhs())),         cl_uint(viennacl::traits::stride2(d_A.lhs())),
+                           cl_uint(viennacl::traits::size1(d_A.lhs())),           cl_uint(viennacl::traits::size2(d_A.lhs())),
+                           cl_uint(viennacl::traits::internal_size1(d_A.lhs())),  cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)),
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
+                           viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+}
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & A,
+               viennacl::vector_base<NumericT> const & x,
+               NumericT alpha,
+               viennacl::vector_base<NumericT>       & y,
+               NumericT beta)
+{
+  assert(A.size1() == y.size());
+  assert(A.size2() == x.size());
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+
+  bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  std::stringstream ss;
+  ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+  unsigned int thread_num = 128;
+  unsigned int group_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (with_alpha_beta)
+    viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             alpha,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             beta,
+                             cl_uint(A.size1()),
+                             cl_uint(A.size2()),
+                             cl_uint(A.internal_size1()),
+                             cl_uint(A.maxnnz()),
+                             cl_uint(A.internal_maxnnz())
+                            )
+    );
+  else
+    viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             cl_uint(A.size1()),
+                             cl_uint(A.size2()),
+                             cl_uint(A.internal_size1()),
+                             cl_uint(A.maxnnz()),
+                             cl_uint(A.internal_maxnnz())
+                            )
+    );
+
+
+}
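+
+// Format note: an ell_matrix stores exactly maxnnz() entries per row (column indices in
+// handle2(), values in handle()), padding shorter rows. The memory access pattern is
+// therefore fully regular, which suits GPUs well, but the padding is wasteful when row
+// lengths vary a lot; hyb_matrix further below moves such overflow entries into a CSR part.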
+
+/** @brief Carries out sparse-matrix (ELL) times dense-matrix multiplication
+*
+* Implementation of the convenience expression y = prod(sp_A, d_A);
+* sp_A being in ELL format
+*
+* @param sp_A     The sparse matrix (ELL)
+* @param d_A      The dense matrix
+* @param y        The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_A,
+               viennacl::matrix_base<NumericT> const & d_A,
+               viennacl::matrix_base<NumericT>       & y) {
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+  viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+  //unsigned int thread_num = 128;
+  //unsigned int group_num = 256;
+  //
+  //k.local_work_size(0, thread_num);
+  //k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+                           cl_uint(sp_A.size1()),
+                           cl_uint(sp_A.size2()),
+                           cl_uint(sp_A.internal_size1()),
+                           cl_uint(sp_A.maxnnz()),
+                           cl_uint(sp_A.internal_maxnnz()),
+                           viennacl::traits::opencl_handle(d_A),
+                           cl_uint(viennacl::traits::start1(d_A)),          cl_uint(viennacl::traits::start2(d_A)),
+                           cl_uint(viennacl::traits::stride1(d_A)),         cl_uint(viennacl::traits::stride2(d_A)),
+                           cl_uint(viennacl::traits::size1(d_A)),           cl_uint(viennacl::traits::size2(d_A)),
+                           cl_uint(viennacl::traits::internal_size1(d_A)),  cl_uint(viennacl::traits::internal_size2(d_A)),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+                          )
+                        );
+}
+
+/** @brief Carries out sparse-matrix (ELL) times transposed dense-matrix multiplication
+*
+* Implementation of the convenience expression y = prod(sp_A, trans(d_A));
+* sp_A being in ELL format
+*
+* @param sp_A     The sparse matrix (ELL)
+* @param d_A      The dense transposed matrix
+* @param y        The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_A,
+               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                            const viennacl::matrix_base<NumericT>,
+                                            viennacl::op_trans > const & d_A,
+               viennacl::matrix_base<NumericT> & y) {
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+  viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+  //unsigned int thread_num = 128;
+  //unsigned int group_num = 256;
+  //
+  //k.local_work_size(0, thread_num);
+  //k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+                           cl_uint(sp_A.size1()),
+                           cl_uint(sp_A.size2()),
+                           cl_uint(sp_A.internal_size1()),
+                           cl_uint(sp_A.maxnnz()),
+                           cl_uint(sp_A.internal_maxnnz()),
+                           viennacl::traits::opencl_handle(d_A.lhs()),
+                           cl_uint(viennacl::traits::start1(d_A.lhs())),          cl_uint(viennacl::traits::start2(d_A.lhs())),
+                           cl_uint(viennacl::traits::stride1(d_A.lhs())),         cl_uint(viennacl::traits::stride2(d_A.lhs())),
+                           cl_uint(viennacl::traits::size1(d_A.lhs())),           cl_uint(viennacl::traits::size2(d_A.lhs())),
+                           cl_uint(viennacl::traits::internal_size1(d_A.lhs())),  cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+                          )
+                        );
+}
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename ScalarT, typename IndexT>
+void prod_impl(viennacl::sliced_ell_matrix<ScalarT, IndexT> const & A,
+               viennacl::vector_base<ScalarT> const & x,
+               ScalarT alpha,
+               viennacl::vector_base<ScalarT>       & y,
+               ScalarT beta)
+{
+  assert(A.size1() == y.size());
+  assert(A.size2() == x.size());
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::sliced_ell_matrix<ScalarT, unsigned int>::init(ctx);
+
+  bool with_alpha_beta = (alpha < ScalarT(1) || alpha > ScalarT(1)) || (beta < 0 || beta > 0);
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  std::stringstream ss;
+  ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::sliced_ell_matrix<ScalarT, IndexT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+  unsigned int group_num = 256;
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+    thread_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (with_alpha_beta)
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+                             A.handle2().opencl_handle(),
+                             A.handle3().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             alpha,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             beta,
+                             cl_uint(A.rows_per_block()))
+    );
+  else
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+                             A.handle2().opencl_handle(),
+                             A.handle3().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             cl_uint(A.rows_per_block()))
+    );
+}
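+
+// Format note: sliced_ell_matrix (SELL-C-sigma) groups the rows into slices of
+// rows_per_block() consecutive rows and pads each slice only to the longest row within
+// that slice, so the padding overhead of plain ELL stays local to a slice. The local work
+// size above is chosen relative to rows_per_block(), with 256 used on NVIDIA devices.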
+
+
+//
+// Hybrid Matrix
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+               viennacl::vector_base<NumericT> const & x,
+               NumericT alpha,
+               viennacl::vector_base<NumericT>       & y,
+               NumericT beta)
+{
+  assert(A.size1() == y.size());
+  assert(A.size2() == x.size());
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+
+  bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);
+
+  viennacl::ocl::packed_cl_uint layout_x;
+  layout_x.start  = cl_uint(viennacl::traits::start(x));
+  layout_x.stride = cl_uint(viennacl::traits::stride(x));
+  layout_x.size   = cl_uint(viennacl::traits::size(x));
+  layout_x.internal_size   = cl_uint(viennacl::traits::internal_size(x));
+
+  viennacl::ocl::packed_cl_uint layout_y;
+  layout_y.start  = cl_uint(viennacl::traits::start(y));
+  layout_y.stride = cl_uint(viennacl::traits::stride(y));
+  layout_y.size   = cl_uint(viennacl::traits::size(y));
+  layout_y.internal_size   = cl_uint(viennacl::traits::internal_size(y));
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+  if (with_alpha_beta)
+    viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             A.handle3().opencl_handle(),
+                             A.handle4().opencl_handle(),
+                             A.handle5().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             alpha,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             beta,
+                             cl_uint(A.size1()),
+                             cl_uint(A.internal_size1()),
+                             cl_uint(A.ell_nnz()),
+                             cl_uint(A.internal_ellnnz())
+                            )
+    );
+  else
+    viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                             A.handle().opencl_handle(),
+                             A.handle3().opencl_handle(),
+                             A.handle4().opencl_handle(),
+                             A.handle5().opencl_handle(),
+                             viennacl::traits::opencl_handle(x),
+                             layout_x,
+                             viennacl::traits::opencl_handle(y),
+                             layout_y,
+                             cl_uint(A.size1()),
+                             cl_uint(A.internal_size1()),
+                             cl_uint(A.ell_nnz()),
+                             cl_uint(A.internal_ellnnz())
+                            )
+    );
+}
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+               viennacl::matrix_base<NumericT> const & d_A,
+               viennacl::matrix_base<NumericT>       & y)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle4().opencl_handle(),
+                           A.handle5().opencl_handle(),
+                           cl_uint(A.size1()),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.ell_nnz()),
+                           cl_uint(A.internal_ellnnz()),
+                           viennacl::traits::opencl_handle(d_A),
+                           cl_uint(viennacl::traits::start1(d_A)),          cl_uint(viennacl::traits::start2(d_A)),
+                           cl_uint(viennacl::traits::stride1(d_A)),         cl_uint(viennacl::traits::stride2(d_A)),
+                           cl_uint(viennacl::traits::size1(d_A)),           cl_uint(viennacl::traits::size2(d_A)),
+                           cl_uint(viennacl::traits::internal_size1(d_A)),  cl_uint(viennacl::traits::internal_size2(d_A)),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+                          )
+  );
+}
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+               viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                            const viennacl::matrix_base<NumericT>,
+                                            viennacl::op_trans > const & d_A,
+               viennacl::matrix_base<NumericT> & y)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+                                             detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle4().opencl_handle(),
+                           A.handle5().opencl_handle(),
+                           cl_uint(A.size1()),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.ell_nnz()),
+                           cl_uint(A.internal_ellnnz()),
+                           viennacl::traits::opencl_handle(d_A.lhs()),
+                           cl_uint(viennacl::traits::start1(d_A.lhs())),          cl_uint(viennacl::traits::start2(d_A.lhs())),
+                           cl_uint(viennacl::traits::stride1(d_A.lhs())),         cl_uint(viennacl::traits::stride2(d_A.lhs())),
+                           cl_uint(viennacl::traits::size1(d_A.lhs())),           cl_uint(viennacl::traits::size2(d_A.lhs())),
+                           cl_uint(viennacl::traits::internal_size1(d_A.lhs())),  cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+                           viennacl::traits::opencl_handle(y),
+                           cl_uint(viennacl::traits::start1(y)),         cl_uint(viennacl::traits::start2(y)),
+                           cl_uint(viennacl::traits::stride1(y)),        cl_uint(viennacl::traits::stride2(y)),
+                           cl_uint(viennacl::traits::size1(y)),          cl_uint(viennacl::traits::size2(y)),
+                           cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+                          )
+  );
+}
+
+
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
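
For reference, the sliced-ELL and HYB prod_impl() overloads above are not called directly;
they are reached through the generic viennacl::linalg::prod() dispatcher. A minimal usage
sketch, assuming OpenCL is enabled (VIENNACL_WITH_OPENCL) and the usual viennacl::copy()
overload from an STL row-map representation to viennacl::hyb_matrix:

  #include <map>
  #include <vector>

  #include "viennacl/hyb_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"

  int main()
  {
    typedef float ScalarType;
    std::size_t const N = 5;

    // Host sparse matrix: one column->value map per row (simple diagonal here).
    std::vector<std::map<unsigned int, ScalarType> > host_A(N);
    for (unsigned int i = 0; i < N; ++i)
      host_A[i][i] = ScalarType(2);

    viennacl::hyb_matrix<ScalarType> A;
    viennacl::copy(host_A, A);              // assumed copy() overload for hyb_matrix

    std::vector<ScalarType> host_x(N, ScalarType(1));
    viennacl::vector<ScalarType> x(N), y(N);
    viennacl::copy(host_x, x);

    y = viennacl::linalg::prod(A, x);       // dispatches to the OpenCL prod_impl() above
    return 0;
  }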

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
new file mode 100644
index 0000000..6a25d81
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
@@ -0,0 +1,68 @@
+#ifndef VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
+    @brief Implementations of operations using vandermonde_matrix
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/** @brief Carries out matrix-vector multiplication with a vandermonde_matrix
+*
+* Implementation of the convenience expression y = prod(A, x);
+*
+* @param A    The Vandermonde matrix
+* @param x    The vector
+* @param y    The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::vandermonde_matrix<NumericT, AlignmentV> const & A,
+               viennacl::vector_base<NumericT> const & x,
+               viennacl::vector_base<NumericT>       & y)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "vandermonde_prod");
+  viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
+                                viennacl::traits::opencl_handle(x),
+                                viennacl::traits::opencl_handle(y),
+                                static_cast<cl_uint>(A.size1())));
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
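
The FFT-based Vandermonde product above is likewise reached through the high-level
interface. A minimal sketch, assuming prod() is overloaded for vandermonde_matrix in the
same way as for the other structured matrix types and that viennacl::copy() accepts an
std::vector holding the generating elements:

  #include <vector>

  #include "viennacl/vandermonde_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"

  int main()
  {
    typedef float ScalarType;
    std::size_t const N = 8;

    // Generating elements v_i of the Vandermonde matrix (entries v_i^j).
    std::vector<ScalarType> generators(N);
    for (std::size_t i = 0; i < N; ++i)
      generators[i] = ScalarType(i + 1) / ScalarType(N);

    viennacl::vandermonde_matrix<ScalarType> A(N, N);
    viennacl::copy(generators, A);          // assumed copy() overload for vandermonde_matrix

    std::vector<ScalarType> host_x(N, ScalarType(1));
    viennacl::vector<ScalarType> x(N), y(N);
    viennacl::copy(host_x, x);

    y = viennacl::linalg::prod(A, x);       // ends up in the FFT-based prod_impl() above
    return 0;
  }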


[02/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
new file mode 100644
index 0000000..cd04482
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
@@ -0,0 +1,1263 @@
+#ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vector_operations.hpp
+    @brief Implementations of vector operations using OpenCL
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/detail/vector_def.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/vector_element.hpp"
+#include "viennacl/linalg/opencl/kernels/scan.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  std::string kernel_name("convert_");
+  kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
+  kernel_name += "_";
+  kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(dest).context());
+  viennacl::linalg::opencl::kernels::vector_convert::init(ctx);
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_convert::program_name(), kernel_name);
+
+  viennacl::ocl::enqueue(k( dest, cl_uint(dest.start()), cl_uint(dest.stride()), cl_uint(dest.size()),
+                            src,  cl_uint( src.start()), cl_uint( src.stride())
+                        ) );
+
+}
+
+template <typename T, typename ScalarType1>
+void av(vector_base<T> & vec1,
+        vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
+                                             (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
+  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                              viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+  viennacl::ocl::packed_cl_uint size_vec1;
+  size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+  size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+  size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+  viennacl::ocl::packed_cl_uint size_vec2;
+  size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+  size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+  size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           size_vec1,
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(vec2),
+                           size_vec2 )
+                        );
+}
+
+
+template <typename T, typename ScalarType1, typename ScalarType2>
+void avbv(vector_base<T> & vec1,
+          vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  std::string kernel_name;
+  if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_cpu_cpu";
+  else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_gpu_cpu";
+  else
+    kernel_name = "avbv_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                              viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+  viennacl::ocl::packed_cl_uint size_vec1;
+  size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+  size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+  size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+  viennacl::ocl::packed_cl_uint size_vec2;
+  size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+  size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+  size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+  viennacl::ocl::packed_cl_uint size_vec3;
+  size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
+  size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+  size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
+  size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           size_vec1,
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(vec2),
+                           size_vec2,
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(vec3),
+                           size_vec3 )
+                        );
+}
+
+
+template <typename T, typename ScalarType1, typename ScalarType2>
+void avbv_v(vector_base<T> & vec1,
+            vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  std::string kernel_name;
+  if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_v_cpu_cpu";
+  else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_v_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+    kernel_name = "avbv_v_gpu_cpu";
+  else
+    kernel_name = "avbv_v_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                              viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+  viennacl::ocl::packed_cl_uint size_vec1;
+  size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+  size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+  size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+  viennacl::ocl::packed_cl_uint size_vec2;
+  size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+  size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+  size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+  viennacl::ocl::packed_cl_uint size_vec3;
+  size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
+  size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+  size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
+  size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           size_vec1,
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(vec2),
+                           size_vec2,
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(vec3),
+                           size_vec3 )
+                        );
+}
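+
+// Usage sketch: these three kernels back the vector expression templates; roughly,
+//   x  = a * y          ->  av(...)
+//   x  = a * y + b * z  ->  avbv(...)
+//   x += a * y + b * z  ->  avbv_v(...)
+// with the len/reciprocal/flip flags encoding how alpha and beta are to be applied.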
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1   The vector to which the value should be assigned
+* @param alpha  The value to be assigned
+* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template <typename T>
+void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
+  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                              viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+  cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           size,
+                           cl_uint(vec1.internal_size()),     //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+                           viennacl::traits::opencl_handle(T(alpha)) )
+                        );
+}
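+
+// Usage sketch: vector_assign(v, T(0), true) clears the entire buffer including padding,
+// which is what viennacl::vector<T>::clear() is expected to do; plain assignments only
+// touch the logical range [start, start + size*stride).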
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1   The first vector (or -range, or -slice)
+* @param vec2   The second vector (or -range, or -slice)
+*/
+template <typename T>
+void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           cl_uint(viennacl::traits::size(vec1)),
+                           viennacl::traits::opencl_handle(vec2),
+                           cl_uint(viennacl::traits::start(vec2)),
+                           cl_uint(viennacl::traits::stride(vec2)),
+                           cl_uint(viennacl::traits::size(vec2)))
+                        );
+}
+
+///////////////////////// Binary Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2, v3 and the operation
+*/
+template <typename T, typename OP>
+void element_op(vector_base<T> & vec1,
+                vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
+
+  std::string kernel_name = "element_pow";
+  cl_uint op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OP>::value)
+  {
+    op_type = 1;
+    kernel_name = "element_div";
+  }
+  else if (viennacl::is_product<OP>::value)
+  {
+    op_type = 0;
+    kernel_name = "element_prod";
+  }
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), kernel_name);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           cl_uint(viennacl::traits::size(vec1)),
+
+                           viennacl::traits::opencl_handle(proxy.lhs()),
+                           cl_uint(viennacl::traits::start(proxy.lhs())),
+                           cl_uint(viennacl::traits::stride(proxy.lhs())),
+
+                           viennacl::traits::opencl_handle(proxy.rhs()),
+                           cl_uint(viennacl::traits::start(proxy.rhs())),
+                           cl_uint(viennacl::traits::stride(proxy.rhs())),
+
+                           op_type)
+                        );
+}
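+
+// Usage sketch: these op_type values correspond to the user-level convenience functions
+//   v1 = viennacl::linalg::element_prod(v2, v3);  // op_type 0
+//   v1 = viennacl::linalg::element_div(v2, v3);   // op_type 1
+//   v1 = viennacl::linalg::element_pow(v2, v3);   // op_type 2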
+
+///////////////////////// Unary Elementwise operations /////////////
+
+/** @brief Implementation of unary element-wise operations v1 = OP(v2)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2 and the operation
+*/
+template <typename T, typename OP>
+void element_op(vector_base<T> & vec1,
+                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");
+
+  viennacl::ocl::packed_cl_uint size_vec1;
+  size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+  size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+  size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+  viennacl::ocl::packed_cl_uint size_vec2;
+  size_vec2.start  = cl_uint(viennacl::traits::start(proxy.lhs()));
+  size_vec2.stride = cl_uint(viennacl::traits::stride(proxy.lhs()));
+  size_vec2.size   = cl_uint(viennacl::traits::size(proxy.lhs()));
+  size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(proxy.lhs()));
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           size_vec1,
+                           viennacl::traits::opencl_handle(proxy.lhs()),
+                           size_vec2)
+                        );
+}
+
+///////////////////////// Norms and inner product ///////////////////
+
+/** @brief Computes the partial inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param partial_result The results of each group
+*/
+template <typename T>
+void inner_prod_impl(vector_base<T> const & vec1,
+                     vector_base<T> const & vec2,
+                     vector_base<T> & partial_result)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
+        && bool("Incompatible vector sizes in inner_prod_impl()!"));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
+
+  assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );
+
+  viennacl::ocl::packed_cl_uint size_vec1;
+  size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+  size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+  size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+  viennacl::ocl::packed_cl_uint size_vec2;
+  size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+  size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+  size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           size_vec1,
+                           viennacl::traits::opencl_handle(vec2),
+                           size_vec2,
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(partial_result)
+                          )
+                        );
+}
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template <typename T>
+void inner_prod_impl(vector_base<T> const & vec1,
+                     vector_base<T> const & vec2,
+                     scalar<T> & result)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+  temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+  // Step 1: Compute partial inner products for each work group:
+  inner_prod_impl(vec1, vec2, temp);
+
+  // Step 2: Sum partial results:
+  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+  ksum.global_work_size(0, ksum.local_work_size(0));
+  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                              cl_uint(viennacl::traits::start(temp)),
+                              cl_uint(viennacl::traits::stride(temp)),
+                              cl_uint(viennacl::traits::size(temp)),
+                              cl_uint(1),
+                              viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                              viennacl::traits::opencl_handle(result) )
+                        );
+}
+
+namespace detail
+{
+  template<typename NumericT>
+  viennacl::ocl::packed_cl_uint make_layout(vector_base<NumericT> const & vec)
+  {
+    viennacl::ocl::packed_cl_uint ret;
+    ret.start           = cl_uint(viennacl::traits::start(vec));
+    ret.stride          = cl_uint(viennacl::traits::stride(vec));
+    ret.size            = cl_uint(viennacl::traits::size(vec));
+    ret.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+    return ret;
+  }
+}
+
+/** @brief Computes multiple inner products where one argument is common to all inner products. <x, y1>, <x, y2>, ..., <x, yN>
+*
+* @param x          The common vector
+* @param vec_tuple  The tuple of vectors y1, y2, ..., yN
+* @param result     The result vector
+*/
+template <typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+                     vector_tuple<NumericT> const & vec_tuple,
+                     vector_base<NumericT> & result)
+{
+  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+  viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::init(ctx);
+
+  viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);
+
+  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "sum_inner_prod");
+  viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "inner_prod1");
+  viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod2");
+  viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod3");
+  viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod4");
+  viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod8");
+
+  vcl_size_t work_groups = inner_prod_kernel_8.global_work_size(0) / inner_prod_kernel_8.local_work_size(0);
+  viennacl::vector<NumericT> temp(8 * work_groups, viennacl::traits::context(x));
+
+  vcl_size_t current_index = 0;
+  while (current_index < vec_tuple.const_size())
+  {
+    switch (vec_tuple.const_size() - current_index)
+    {
+      case 7:
+      case 6:
+      case 5:
+      case 4:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+        viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
+                                                   viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                   viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                   viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                   viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+                                                   viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 4 * inner_prod_kernel_4.local_work_size()),
+                                                   viennacl::traits::opencl_handle(temp)
+                                                  ) );
+
+        ksum.global_work_size(0, 4 * ksum.local_work_size(0));
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(work_groups),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 4 * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result),
+                                    cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                    cl_uint(viennacl::traits::stride(result))
+                                    )
+                              );
+      }
+        current_index += 4;
+        break;
+
+      case 3:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+        viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
+                                                    viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                    viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                    viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 3 * inner_prod_kernel_3.local_work_size()),
+                                                    viennacl::traits::opencl_handle(temp)
+                                                   ) );
+
+        ksum.global_work_size(0, 3 * ksum.local_work_size(0));
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(work_groups),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 3 * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result),
+                                    cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                    cl_uint(viennacl::traits::stride(result))
+                                    )
+                              );
+      }
+        current_index += 3;
+        break;
+
+      case 2:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
+                                                    viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                    viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 2 * inner_prod_kernel_2.local_work_size()),
+                                                    viennacl::traits::opencl_handle(temp)
+                                                  ) );
+
+        ksum.global_work_size(0, 2 * ksum.local_work_size(0));
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(work_groups),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 2 * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result),
+                                    cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                    cl_uint(viennacl::traits::stride(result))
+                                    )
+                              );
+      }
+        current_index += 2;
+        break;
+
+      case 1:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
+        viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
+                                                    viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 1 * inner_prod_kernel_1.local_work_size()),
+                                                    viennacl::traits::opencl_handle(temp)
+                                                  ) );
+
+        ksum.global_work_size(0, 1 * ksum.local_work_size(0));
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(work_groups),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 1 * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result),
+                                    cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                    cl_uint(viennacl::traits::stride(result))
+                                    )
+                              );
+      }
+        current_index += 1;
+        break;
+
+      default: //8 or more vectors
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index    );
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+        vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
+        vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
+        vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
+        vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);
+        viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
+                                                    viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                    viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                    viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                    viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+                                                    viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
+                                                    viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
+                                                    viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
+                                                    viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
+                                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 8 * inner_prod_kernel_8.local_work_size()),
+                                                    viennacl::traits::opencl_handle(temp)
+                                                  ) );
+
+        ksum.global_work_size(0, 8 * ksum.local_work_size(0));
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(work_groups),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 8 * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result),
+                                    cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                    cl_uint(viennacl::traits::stride(result))
+                                    )
+                              );
+      }
+        current_index += 8;
+        break;
+    }
+  }
+
+}
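+
+// Usage sketch (viennacl::tie() assumed from the public vector API): packing the y-vectors
+// into a vector_tuple lets a single call compute all inner products, e.g.
+//   viennacl::vector<NumericT> r(3);
+//   r = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));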
+
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors with the final summation step on the CPU - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+template <typename T>
+void inner_prod_cpu(vector_base<T> const & vec1,
+                    vector_base<T> const & vec2,
+                    T & result)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+  temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+  // Step 1: Compute partial inner products for each work group:
+  inner_prod_impl(vec1, vec2, temp);
+
+  // Step 2: Sum partial results:
+
+  // Now copy partial results from GPU back to CPU and run reduction there:
+  std::vector<T> temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result += *it;
+}
+
+
+//////////// Helper for norms
+
+/** @brief Computes the partial work group results for vector norms
+*
+* @param vec The vector
+* @param partial_result The result scalar
+* @param norm_id        Norm selector. 0: norm_inf, 1: norm_1, 2: norm_2
+*/
+template <typename T>
+void norm_reduction_impl(vector_base<T> const & vec,
+                         vector_base<T> & partial_result,
+                          cl_uint norm_id)
+{
+  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");
+
+  assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+                           cl_uint(viennacl::traits::start(vec)),
+                           cl_uint(viennacl::traits::stride(vec)),
+                           cl_uint(viennacl::traits::size(vec)),
+                           cl_uint(norm_id),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(partial_result) )
+                        );
+}
+
+
+//////////// Norm 1
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_1_impl(vector_base<T> const & vec,
+                 scalar<T> & result)
+{
+  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 1);
+
+  // Step 2: Compute the partial reduction using OpenCL
+  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+  ksum.global_work_size(0, ksum.local_work_size(0));
+  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                              cl_uint(viennacl::traits::start(temp)),
+                              cl_uint(viennacl::traits::stride(temp)),
+                              cl_uint(viennacl::traits::size(temp)),
+                              cl_uint(1),
+                              viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                              result)
+                        );
+}
+
+/** @brief Computes the l^1-norm of a vector with final reduction on CPU
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_1_cpu(vector_base<T> const & vec,
+                T & result)
+{
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 1);
+
+  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+  typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+  CPUVectorType temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result += static_cast<T>(*it);
+}
+
+
+
+//////// Norm 2
+
+
+/** @brief Computes the l^2-norm of a vector - implementation using OpenCL summation at second step
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_2_impl(vector_base<T> const & vec,
+                 scalar<T> & result)
+{
+  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 2);
+
+  // Step 2: Reduction via OpenCL
+  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+  ksum.global_work_size(0, ksum.local_work_size(0));
+  viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+                                cl_uint(viennacl::traits::start(temp)),
+                                cl_uint(viennacl::traits::stride(temp)),
+                                cl_uint(viennacl::traits::size(temp)),
+                                cl_uint(2),
+                                viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                                result)
+                        );
+}
+
+/** @brief Computes the l^2-norm of a vector with final reduction on CPU
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_2_cpu(vector_base<T> const & vec,
+                T & result)
+{
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 2);
+
+  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+  typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+  CPUVectorType temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result += static_cast<T>(*it);
+  result = std::sqrt(result);
+}
+
+
+
+////////// Norm inf
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_inf_impl(vector_base<T> const & vec,
+                   scalar<T> & result)
+{
+  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 0);
+
+  // Step 2: Reduction via OpenCL
+  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+  ksum.global_work_size(0, ksum.local_work_size(0));
+  viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+                               cl_uint(viennacl::traits::start(temp)),
+                               cl_uint(viennacl::traits::stride(temp)),
+                               cl_uint(viennacl::traits::size(temp)),
+                               cl_uint(0),
+                               viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                               result)
+                        );
+}
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_inf_cpu(vector_base<T> const & vec,
+                  T & result)
+{
+  vcl_size_t work_groups = 128;
+  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+  // Step 1: Compute the partial work group results
+  norm_reduction_impl(vec, temp, 0);
+
+  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+  typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+  CPUVectorType temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result = std::max(result, static_cast<T>(*it));
+}
+
+
+/////////// index norm_inf
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec The vector
+* @return The result. Note that the result must be a CPU scalar (unsigned int), since GPU scalars are floating point types.
+*/
+template <typename T>
+cl_uint index_norm_inf(vector_base<T> const & vec)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
+  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
+
+  //TODO: Use multi-group kernel for large vector sizes
+
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+                           cl_uint(viennacl::traits::start(vec)),
+                           cl_uint(viennacl::traits::stride(vec)),
+                           cl_uint(viennacl::traits::size(vec)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+                           viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
+
+  //read value:
+  cl_uint result;
+  cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
+  VIENNACL_ERR_CHECK(err);
+  return result;
+}
+
+
+////////// max
+
+/** @brief Computes the maximum value of a vector, where the result is stored in an OpenCL buffer.
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void max_impl(vector_base<NumericT> const & x,
+                   scalar<NumericT> & result)
+{
+  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
+
+  k.global_work_size(0, work_groups * k.local_work_size(0));
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+                           cl_uint(viennacl::traits::start(x)),
+                           cl_uint(viennacl::traits::stride(x)),
+                           cl_uint(viennacl::traits::size(x)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(temp)
+                         ));
+
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
+                           cl_uint(viennacl::traits::start(temp)),
+                           cl_uint(viennacl::traits::stride(temp)),
+                           cl_uint(viennacl::traits::size(temp)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(result)
+                         ));
+}
+
+/** @brief Computes the maximum value of a vector, where the result is stored in a host (CPU) scalar.
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void max_cpu(vector_base<NumericT> const & x,
+             NumericT & result)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
+
+  k.global_work_size(0, work_groups * k.local_work_size(0));
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+                           cl_uint(viennacl::traits::start(x)),
+                           cl_uint(viennacl::traits::stride(x)),
+                           cl_uint(viennacl::traits::size(x)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(temp)
+                         ));
+
+  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+  typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type>  CPUVectorType;
+
+  CPUVectorType temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = static_cast<NumericT>(temp_cpu[0]);
+  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result = std::max(result, static_cast<NumericT>(*it));
+
+}
+
+
+////////// min
+
+/** @brief Computes the minimum of a vector, where the result is stored in an OpenCL buffer.
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void min_impl(vector_base<NumericT> const & x,
+                   scalar<NumericT> & result)
+{
+  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
+
+  k.global_work_size(0, work_groups * k.local_work_size(0));
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+                           cl_uint(viennacl::traits::start(x)),
+                           cl_uint(viennacl::traits::stride(x)),
+                           cl_uint(viennacl::traits::size(x)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(temp)
+                         ));
+
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
+                           cl_uint(viennacl::traits::start(temp)),
+                           cl_uint(viennacl::traits::stride(temp)),
+                           cl_uint(viennacl::traits::size(temp)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(result)
+                         ));
+}
+
+/** @brief Computes the minimum value of a vector, where the result is stored in a host (CPU) scalar.
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void min_cpu(vector_base<NumericT> const & x,
+                  NumericT & result)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+  vcl_size_t work_groups = 128;
+  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
+
+  k.global_work_size(0, work_groups * k.local_work_size(0));
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+                           cl_uint(viennacl::traits::start(x)),
+                           cl_uint(viennacl::traits::stride(x)),
+                           cl_uint(viennacl::traits::size(x)),
+                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+                           viennacl::traits::opencl_handle(temp)
+                         ));
+
+  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+  typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type>  CPUVectorType;
+
+  CPUVectorType temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = static_cast<NumericT>(temp_cpu[0]);
+  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result = std::min(result, static_cast<NumericT>(*it));
+}
+
+////////// sum
+
+/** @brief Computes the sum over all entries of a vector
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void sum_impl(vector_base<NumericT> const & x,
+                   scalar<NumericT> & result)
+{
+  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(x.size(), NumericT(1), viennacl::traits::context(x));
+  viennacl::linalg::opencl::inner_prod_impl(x, all_ones, result);
+}
+
+/** @brief Computes the sum over all entries of a vector.
+*
+* @param x      The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void sum_cpu(vector_base<NumericT> const & x, NumericT & result)
+{
+  scalar<NumericT> tmp(0, viennacl::traits::context(x));
+  sum_impl(x, tmp);
+  result = tmp;
+}
+
+
+//TODO: Special case vec1 == vec2 allows improvement!!
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1   The first vector
+* @param vec2   The second vector
+* @param alpha  The first transformation coefficient
+* @param beta   The second transformation coefficient
+*/
+template <typename T>
+void plane_rotation(vector_base<T> & vec1,
+                    vector_base<T> & vec2,
+                    T alpha, T beta)
+{
+  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+  assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           cl_uint(viennacl::traits::size(vec1)),
+                           viennacl::traits::opencl_handle(vec2),
+                           cl_uint(viennacl::traits::start(vec2)),
+                           cl_uint(viennacl::traits::stride(vec2)),
+                           cl_uint(viennacl::traits::size(vec2)),
+                           viennacl::traits::opencl_handle(alpha),
+                           viennacl::traits::opencl_handle(beta))
+                        );
+}
+
+
+//////////////////////////
+
+
+namespace detail
+{
+  /** @brief Worker routine for scan routines using OpenCL
+   *
+   * Note on performance: For non-in-place scans one could optimize away the temporary 'opencl_carries'-array.
+   * This, however, only provides small savings in the latency-dominated regime, yet would effectively double the amount of code to maintain.
+   */
+  template<typename NumericT>
+  void scan_impl(vector_base<NumericT> const & input,
+                 vector_base<NumericT>       & output,
+                 bool is_inclusive)
+  {
+    vcl_size_t local_worksize = 128;
+    vcl_size_t workgroups = 128;
+
+    viennacl::backend::mem_handle opencl_carries;
+    viennacl::backend::memory_create(opencl_carries, sizeof(NumericT)*workgroups, viennacl::traits::context(input));
+
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+    viennacl::linalg::opencl::kernels::scan<NumericT>::init(ctx);
+    viennacl::ocl::kernel& k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_1");
+    viennacl::ocl::kernel& k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_2");
+    viennacl::ocl::kernel& k3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_3");
+
+    // First step: Scan within each thread group and write carries
+    k1.local_work_size(0, local_worksize);
+    k1.global_work_size(0, workgroups * local_worksize);
+    viennacl::ocl::enqueue(k1( input, cl_uint( input.start()), cl_uint( input.stride()), cl_uint(input.size()),
+                              output, cl_uint(output.start()), cl_uint(output.stride()),
+                              cl_uint(is_inclusive ? 0 : 1), opencl_carries.opencl_handle())
+                          );
+
+    // Second step: Compute offset for each thread group (exclusive scan for each thread group)
+    k2.local_work_size(0, workgroups);
+    k2.global_work_size(0, workgroups);
+    viennacl::ocl::enqueue(k2(opencl_carries.opencl_handle()));
+
+    // Third step: Offset each thread group accordingly
+    k3.local_work_size(0, local_worksize);
+    k3.global_work_size(0, workgroups * local_worksize);
+    viennacl::ocl::enqueue(k3(output, cl_uint(output.start()), cl_uint(output.stride()), cl_uint(output.size()),
+                              opencl_carries.opencl_handle())
+                          );
+  }
+}
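[Editor's note: as a reference for the semantics realized by the three kernel passes in scan_impl() above, here is a short host-side sketch of inclusive and exclusive prefix sums. Illustration only, not part of the committed diff; all names are arbitrary.]

    #include <vector>
    #include <cstddef>

    // Inclusive scan: output[i] = input[0] + ... + input[i]
    template<typename T>
    void inclusive_scan_host(std::vector<T> const & input, std::vector<T> & output)
    {
      output.resize(input.size());
      T running = T(0);
      for (std::size_t i = 0; i < input.size(); ++i)
      {
        running += input[i];
        output[i] = running;
      }
    }

    // Exclusive scan: output[i] = input[0] + ... + input[i-1], output[0] = 0
    template<typename T>
    void exclusive_scan_host(std::vector<T> const & input, std::vector<T> & output)
    {
      output.resize(input.size());
      T running = T(0);
      for (std::size_t i = 0; i < input.size(); ++i)
      {
        output[i] = running;
        running += input[i];
      }
    }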
+
+
+/** @brief This function implements an inclusive scan using OpenCL.
+*
+* @param input       Input vector.
+* @param output      The output vector. Either identical to input or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & input,
+                    vector_base<NumericT>       & output)
+{
+  detail::scan_impl(input, output, true);
+}
+
+
+/** @brief This function implements an exclusive scan using OpenCL.
+*
+* @param input       Input vector
+* @param output      The output vector. Either identical to input or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & input,
+                    vector_base<NumericT>       & output)
+{
+  detail::scan_impl(input, output, false);
+}
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
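[Editor's note: all of the *_cpu variants above follow the same two-step pattern: a partial reduction into 128 per-work-group values on the device, followed by a final reduction on the host. A minimal usage sketch follows. Illustration only, not part of the committed diff; it assumes an OpenCL-enabled build (VIENNACL_WITH_OPENCL) and assumes that viennacl/linalg/vector_operations.hpp pulls in the OpenCL backend shown above.]

    #define VIENNACL_WITH_OPENCL
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"  // assumed to include the OpenCL backend shown above

    int main()
    {
      // 1000 entries, all equal to 2
      viennacl::vector<float> v = viennacl::scalar_vector<float>(1000, 2.0f);

      float l2 = 0, linf = 0, total = 0;
      viennacl::linalg::opencl::norm_2_cpu(v, l2);      // sqrt(1000 * 4) ~ 63.25
      viennacl::linalg::opencl::norm_inf_cpu(v, linf);  // 2
      viennacl::linalg::opencl::sum_cpu(v, total);      // 2000
      return 0;
    }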

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
new file mode 100644
index 0000000..9721517
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
@@ -0,0 +1,129 @@
+#ifndef VIENNACL_LINALG_POWER_ITER_HPP_
+#define VIENNACL_LINALG_POWER_ITER_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/power_iter.hpp
+    @brief Defines a tag for the configuration of the power iteration method.
+
+    Contributed by Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    /** @brief A tag for the power iteration algorithm. */
+    class power_iter_tag
+    {
+      public:
+
+        /** @brief The constructor
+        *
+        * @param tfac      If the eigenvalue does not change more than this termination factor, the algorithm stops
+        * @param max_iters Maximum number of iterations for the power iteration
+        */
+        power_iter_tag(double tfac = 1e-8, vcl_size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
+
+        /** @brief Sets the factor for termination */
+        void factor(double fct){ termination_factor_ = fct; }
+
+        /** @brief Returns the factor for termination */
+        double factor() const { return termination_factor_; }
+
+        vcl_size_t max_iterations() const { return max_iterations_; }
+        void max_iterations(vcl_size_t new_max) { max_iterations_ = new_max; }
+
+      private:
+        double termination_factor_;
+        vcl_size_t max_iterations_;
+
+    };
+
+   /**
+    *   @brief Implementation of the calculation of the largest eigenvalue (in modulus) and the associated eigenvector using power iteration
+    *
+    *   @param A             The system matrix
+    *   @param tag           Tag with termination factor
+    *   @param eigenvec      Vector which holds the associated eigenvector once the routine completes
+    *   @return              Returns the largest eigenvalue computed by the power iteration method
+    */
+    template<typename MatrixT, typename VectorT >
+    typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+    eig(MatrixT const& A, power_iter_tag const & tag, VectorT & eigenvec)
+    {
+
+      typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+      vcl_size_t matrix_size = A.size1();
+      VectorT r(eigenvec);
+      std::vector<CPU_ScalarType> s(matrix_size);
+
+      for (vcl_size_t i=0; i<s.size(); ++i)
+        s[i] = CPU_ScalarType(i % 3) * CPU_ScalarType(0.1234) - CPU_ScalarType(0.5);   //'random' starting vector
+
+      detail::copy_vec_to_vec(s, eigenvec);
+
+      double epsilon = tag.factor();
+      CPU_ScalarType norm = norm_2(eigenvec);
+      CPU_ScalarType norm_prev = 0;
+      long numiter = 0;
+
+      for (vcl_size_t i=0; i<tag.max_iterations(); ++i)
+      {
+        if (std::fabs(norm - norm_prev) / std::fabs(norm) < epsilon)
+          break;
+
+        eigenvec /= norm;
+        r = viennacl::linalg::prod(A, eigenvec);  //using helper vector r for the computation of x <- A * x in order to avoid the repeated creation of temporaries
+        eigenvec = r;
+        norm_prev = norm;
+        norm = norm_2(eigenvec);
+        numiter++;
+      }
+
+      return norm;
+    }
+
+    /**
+     *   @brief Implementation of the calculation of the largest eigenvalue (in modulus) using power iteration. Does not return the eigenvector.
+     *
+     *   @param A             The system matrix
+     *   @param tag           Tag with termination factor
+     *   @return              Returns the largest eigenvalue computed by the power iteration method
+     */
+    template< typename MatrixT >
+    typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+    eig(MatrixT const& A, power_iter_tag const & tag)
+    {
+      typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
+
+      VectorT eigenvec(A.size1());
+      return eig(A, tag, eigenvec);
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
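[Editor's note: a minimal driver for the two eig() overloads above. Illustration only, not part of the committed diff; the 3x3 matrix values are arbitrary placeholders and the sketch assumes the usual viennacl::copy overload for STL-style sparse matrices.]

    #include <iostream>
    #include <vector>
    #include <map>

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/power_iter.hpp"

    int main()
    {
      // Small symmetric test matrix, assembled on the host and copied over
      std::vector<std::map<unsigned int, double> > cpu_A(3);
      cpu_A[0][0] = 2.0; cpu_A[0][1] = 1.0;
      cpu_A[1][0] = 1.0; cpu_A[1][1] = 3.0; cpu_A[1][2] = 1.0;
      cpu_A[2][1] = 1.0; cpu_A[2][2] = 4.0;

      viennacl::compressed_matrix<double> A(3, 3);
      viennacl::copy(cpu_A, A);

      viennacl::linalg::power_iter_tag ptag(1e-8, 20000);

      // Largest eigenvalue (in modulus) only:
      double lambda = viennacl::linalg::eig(A, ptag);

      // Largest eigenvalue plus associated eigenvector:
      viennacl::vector<double> evec(3);
      double lambda2 = viennacl::linalg::eig(A, ptag, evec);

      std::cout << lambda << " " << lambda2 << std::endl;
      return 0;
    }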

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
new file mode 100644
index 0000000..af041dc
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
@@ -0,0 +1,370 @@
+#ifndef VIENNACL_LINALG_PROD_HPP_
+#define VIENNACL_LINALG_PROD_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/prod.hpp
+    @brief Generic interface for matrix-vector and matrix-matrix products.
+           See viennacl/linalg/vector_operations.hpp, viennacl/linalg/matrix_operations.hpp, and
+           viennacl/linalg/sparse_matrix_operations.hpp for implementations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+  //
+  // generic prod function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+    #ifdef VIENNACL_WITH_MTL4
+    // ----------------------------------------------------
+    // mtl4
+    //
+    template< typename MatrixT, typename VectorT >
+    typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
+    {
+      return VectorT(matrix * vector);
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_ARMADILLO
+    // ----------------------------------------------------
+    // Armadillo
+    //
+    template<typename NumericT, typename VectorT>
+    VectorT prod(arma::SpMat<NumericT> const& A, VectorT const& vector)
+    {
+      return A * vector;
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_EIGEN
+    // ----------------------------------------------------
+    // Eigen
+    //
+    template< typename MatrixT, typename VectorT >
+    typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
+    {
+      return matrix * vector;
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename MatrixT, typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
+    {
+      // std::cout << "ublas .. " << std::endl;
+      return boost::numeric::ublas::prod(matrix, vector);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // STL type
+    //
+
+    // dense matrix-vector product:
+    template< typename T, typename A1, typename A2, typename VectorT >
+    VectorT
+    prod(std::vector< std::vector<T, A1>, A2 > const & matrix, VectorT const& vector)
+    {
+      VectorT result(matrix.size());
+      for (typename std::vector<T, A1>::size_type i=0; i<matrix.size(); ++i)
+      {
+        result[i] = 0; //we will not assume that VectorT is initialized to zero
+        for (typename std::vector<T, A1>::size_type j=0; j<matrix[i].size(); ++j)
+          result[i] += matrix[i][j] * vector[j];
+      }
+      return result;
+    }
+
+    // sparse matrix-vector product:
+    template< typename KEY, typename DATA, typename COMPARE, typename AMAP, typename AVEC, typename VectorT >
+    VectorT
+    prod(std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > const& matrix, VectorT const& vector)
+    {
+      typedef std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > MatrixType;
+
+      VectorT result(matrix.size());
+      for (typename MatrixType::size_type i=0; i<matrix.size(); ++i)
+      {
+        result[i] = 0; //we will not assume that VectorT is initialized to zero
+        for (typename std::map<KEY, DATA, COMPARE, AMAP>::const_iterator row_entries = matrix[i].begin();
+             row_entries != matrix[i].end();
+             ++row_entries)
+          result[i] += row_entries->second * vector[row_entries->first];
+      }
+      return result;
+    }
+
+
+    /*template< typename MatrixT, typename VectorT >
+    VectorT
+    prod(MatrixT const& matrix, VectorT const& vector,
+         typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< MatrixT >::type >::value
+                                     >::type* dummy = 0)
+    {
+      // std::cout << "std .. " << std::endl;
+      return prod_impl(matrix, vector);
+    }*/
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+
+    // standard product:
+    template<typename NumericT>
+    viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::matrix_base<NumericT>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_base<NumericT> const & A,
+         viennacl::matrix_base<NumericT> const & B)
+    {
+      return viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::matrix_base<NumericT>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
+
+    // right factor is a matrix expression:
+    template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+    viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_base<NumericT> const & A,
+         viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & B)
+    {
+      return viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
+
+    // left factor is a matrix expression:
+    template<typename LhsT, typename RhsT, typename OpT, typename NumericT>
+    viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                 const viennacl::matrix_base<NumericT>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & A,
+         viennacl::matrix_base<NumericT> const & B)
+    {
+      return viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                          const viennacl::matrix_base<NumericT>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
+
+
+    // both factors transposed:
+    template<typename LhsT1, typename RhsT1, typename OpT1,
+             typename LhsT2, typename RhsT2, typename OpT2>
+    viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+                                 const viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1> const & A,
+         viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2> const & B)
+    {
+      return viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+                                          const viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
+
+
+
+    // matrix-vector product
+    template< typename NumericT>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::vector_base<NumericT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_base<NumericT> const & A,
+         viennacl::vector_base<NumericT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+    // matrix-vector product (resolve ambiguity)
+    template<typename NumericT, typename F>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::vector_base<NumericT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix<NumericT, F> const & A,
+         viennacl::vector_base<NumericT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+    // matrix-vector product (resolve ambiguity)
+    template<typename MatrixT, typename NumericT>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::vector_base<NumericT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_range<MatrixT> const & A,
+         viennacl::vector_base<NumericT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+    // matrix-vector product (resolve ambiguity)
+    template<typename MatrixT, typename NumericT>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::vector_base<NumericT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_slice<MatrixT> const & A,
+         viennacl::vector_base<NumericT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+    // matrix-vector product with matrix expression (including transpose)
+    template< typename NumericT, typename LhsT, typename RhsT, typename OpT>
+    viennacl::vector_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                 const viennacl::vector_base<NumericT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & A,
+         viennacl::vector_base<NumericT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+
+    // matrix-vector product with vector expression
+    template< typename NumericT, typename LhsT, typename RhsT, typename OpT>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                 const viennacl::vector_expression<const LhsT, const RhsT, OpT>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_base<NumericT> const & A,
+         viennacl::vector_expression<const LhsT, const RhsT, OpT> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+                                          const viennacl::vector_expression<const LhsT, const RhsT, OpT>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+
+    // matrix-vector product with matrix expression (including transpose) and vector expression
+    template<typename LhsT1, typename RhsT1, typename OpT1,
+             typename LhsT2, typename RhsT2, typename OpT2>
+    viennacl::vector_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+                                 const viennacl::vector_expression<const LhsT2, const RhsT2, OpT2>,
+                                 viennacl::op_prod >
+    prod(viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1> const & A,
+         viennacl::vector_expression<const LhsT2, const RhsT2, OpT2> const & x)
+    {
+      return viennacl::vector_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+                                          const viennacl::vector_expression<const LhsT2, const RhsT2, OpT2>,
+                                          viennacl::op_prod >(A, x);
+    }
+
+
+
+
+    template< typename SparseMatrixType, typename SCALARTYPE>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  viennacl::matrix_expression<const SparseMatrixType,
+                                                              const matrix_base <SCALARTYPE>,
+                                                              op_prod >
+                                 >::type
+    prod(const SparseMatrixType & sp_mat,
+         const viennacl::matrix_base<SCALARTYPE> & d_mat)
+    {
+      return viennacl::matrix_expression<const SparseMatrixType,
+                                         const viennacl::matrix_base<SCALARTYPE>,
+                                         op_prod >(sp_mat, d_mat);
+    }
+
+    // right factor is transposed
+    template< typename SparseMatrixType, typename SCALARTYPE>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  viennacl::matrix_expression< const SparseMatrixType,
+                                                               const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+                                                                                                 const viennacl::matrix_base<SCALARTYPE>,
+                                                                                                 op_trans>,
+                                                               viennacl::op_prod >
+                                  >::type
+    prod(const SparseMatrixType & A,
+         viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+                                     const viennacl::matrix_base<SCALARTYPE>,
+                                     op_trans> const & B)
+    {
+      return viennacl::matrix_expression< const SparseMatrixType,
+                                          const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+                                                                            const viennacl::matrix_base<SCALARTYPE>,
+                                                                            op_trans>,
+                                          viennacl::op_prod >(A, B);
+    }
+
+
+    /** @brief Sparse matrix-matrix product with compressed_matrix objects */
+    template<typename NumericT>
+    viennacl::matrix_expression<const compressed_matrix<NumericT>,
+                                const compressed_matrix<NumericT>,
+                                op_prod >
+    prod(compressed_matrix<NumericT> const & A,
+         compressed_matrix<NumericT> const & B)
+    {
+      return viennacl::matrix_expression<const compressed_matrix<NumericT>,
+                                         const compressed_matrix<NumericT>,
+                                         op_prod >(A, B);
+    }
+
+    /** @brief Generic matrix-vector product with user-provided sparse matrix type */
+    template<typename SparseMatrixType, typename NumericT>
+    vector_expression<const SparseMatrixType,
+                      const vector_base<NumericT>,
+                      op_prod >
+    prod(const SparseMatrixType & A,
+         const vector_base<NumericT> & x)
+    {
+      return vector_expression<const SparseMatrixType,
+                               const vector_base<NumericT>,
+                               op_prod >(A, x);
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
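[Editor's note: the prod() overloads above only build expression templates; the actual kernels run when the expression is assigned to a matrix or vector. A minimal sketch of typical call sites follows. Illustration only, not part of the committed diff; sizes and types are placeholders.]

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    int main()
    {
      viennacl::matrix<float>  A(64, 64), B(64, 64), C(64, 64);
      viennacl::vector<float>  x(64), y(64);
      viennacl::compressed_matrix<float> S(64, 64);

      y = viennacl::linalg::prod(A, x);                   // dense matrix-vector product
      C = viennacl::linalg::prod(A, B);                   // dense matrix-matrix product
      C = viennacl::linalg::prod(viennacl::trans(A), B);  // transposed left factor
      y = viennacl::linalg::prod(S, x);                   // sparse matrix-vector product
      return 0;
    }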


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
new file mode 100644
index 0000000..dba094b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
@@ -0,0 +1,832 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai.hpp
+    @brief Main implementation of SPAI (not FSPAI). Experimental.
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+//local includes
+#include "viennacl/linalg/detail/spai/spai_tag.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+
+
+#define VIENNACL_SPAI_K_b 20
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+// Debug helper: prints a sparse vector
+template<typename SparseVectorT>
+void print_sparse_vector(SparseVectorT const & v)
+{
+  for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it)
+    std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl;
+}
+
+template<typename DenseMatrixT>
+void print_matrix(DenseMatrixT & m)
+{
+  for (int i = 0; i < m.size2(); ++i)
+  {
+    for (int j = 0; j < m.size1(); ++j)
+      std::cout<<m(j, i)<<" ";
+    std::cout<<std::endl;
+  }
+}
+
+/** @brief Adds a scaled sparse vector to a sparse vector: res_v += b * v
+ *
+ * @param v      initial sparse vector
+ * @param b      scalar
+ * @param res_v  output vector
+ */
+template<typename SparseVectorT, typename NumericT>
+void add_sparse_vectors(SparseVectorT const & v, NumericT b,  SparseVectorT & res_v)
+{
+  for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
+    res_v[v_it->first] += b*v_it->second;
+}
+
+//sparse-matrix - vector product
+/** @brief Computation of the residual res = A*v - e_ind, where e_ind is the ind-th unit vector
+ *
+ * @param A_v_c   column major vectorized input sparse matrix
+ * @param v       sparse vector, in this case new column of preconditioner matrix
+ * @param ind     index for current column
+ * @param res     residual
+ */
+template<typename SparseVectorT, typename NumericT>
+void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
+                           SparseVectorT const & v,
+                           unsigned int ind,
+                           SparseVectorT & res)
+{
+  for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
+    add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
+
+  res[ind] -= NumericT(1);
+}
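[Editor's note: for reference, the residual assembled by compute_spai_residual() above is r = A*m - e_ind with A stored column-wise. A host-only sketch using std::map as a stand-in for the sparse-vector type; illustration only, not part of the committed diff.]

    #include <map>
    #include <vector>

    typedef std::map<unsigned int, double> HostSparseVector;

    // r = A*m - e_ind, with A given column-wise: A_v_c[j] is the j-th column of A
    HostSparseVector host_spai_residual(std::vector<HostSparseVector> const & A_v_c,
                                        HostSparseVector const & m,
                                        unsigned int ind)
    {
      HostSparseVector r;
      for (HostSparseVector::const_iterator m_it = m.begin(); m_it != m.end(); ++m_it)
        for (HostSparseVector::const_iterator a_it = A_v_c[m_it->first].begin();
             a_it != A_v_c[m_it->first].end(); ++a_it)
          r[a_it->first] += m_it->second * a_it->second;  // accumulate column j scaled by m_j

      r[ind] -= 1.0;                                      // subtract the ind-th unit vector
      return r;
    }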
+
+/** @brief Setting up the index sets of columns and rows for a certain column
+ *
+ * @param A_v_c   column major vectorized initial sparse matrix
+ * @param v       current column of preconditioner matrix
+ * @param J       set of column indices
+ * @param I       set of row indices
+ */
+template<typename SparseVectorT>
+void build_index_set(std::vector<SparseVectorT> const & A_v_c,
+                     SparseVectorT const & v,
+                     std::vector<unsigned int> & J,
+                     std::vector<unsigned int> & I)
+{
+  buildColumnIndexSet(v, J);
+  projectRows(A_v_c, J, I);
+}
+
+/** @brief Initializes a dense matrix from a sparse one
+ *
+ * @param A_in    Original sparse matrix
+ * @param J       Set of column indices
+ * @param I       Set of row indices
+ * @param A_out   dense matrix output
+ */
+template<typename SparseMatrixT, typename DenseMatrixT>
+void initProjectSubMatrix(SparseMatrixT const & A_in,
+                          std::vector<unsigned int> const & J,
+                          std::vector<unsigned int> & I,
+                          DenseMatrixT & A_out)
+{
+  A_out.resize(I.size(), J.size(), false);
+  for (vcl_size_t j = 0; j < J.size(); ++j)
+    for (vcl_size_t i = 0; i < I.size(); ++i)
+      A_out(i,j) = A_in(I[i],J[j]);
+}
+
+
+/************************************************** CPU BLOCK SET UP ***************************************/
+
+/** @brief Setting up blocks and QR factorizing them on CPU
+ *
+ * @param A        initial sparse matrix
+ * @param A_v_c    column major vectorized initial sparse matrix
+ * @param M_v      initialized preconditioner
+ * @param g_I      container of row indices
+ * @param g_J      container of column indices
+ * @param g_A_I_J  container of dense matrices -> R matrices after QR factorization
+ * @param g_b_v    container of vectors beta, necessary for Q recovery
+ */
+template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>
+void block_set_up(SparseMatrixT const & A,
+                  std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<SparseVectorT> const & M_v,
+                  std::vector<std::vector<unsigned int> >& g_I,
+                  std::vector<std::vector<unsigned int> >& g_J,
+                  std::vector<DenseMatrixT>& g_A_I_J,
+                  std::vector<VectorT>& g_b_v)
+{
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
+    initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
+    //print_matrix(g_A_I_J[i]);
+    single_qr(g_A_I_J[i], g_b_v[i]);
+    //print_matrix(g_A_I_J[i]);
+  }
+}
+
+/** @brief Setting up index set of columns and rows for all columns
+ *
+ * @param A_v_c   column major vectorized initial sparse matrix
+ * @param M_v     initialized preconditioner
+ * @param g_J     container of column indices
+ * @param g_I     container of row indices
+ */
+template<typename SparseVectorT>
+void index_set_up(std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<SparseVectorT> const & M_v,
+                  std::vector<std::vector<unsigned int> > & g_J,
+                  std::vector<std::vector<unsigned int> > & g_I)
+{
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
+  }
+}
+
+/************************************************** GPU BLOCK SET UP ***************************************/
+
+/** @brief Setting up blocks and QR factorizing them on GPU
+ *
+ * @param A            initial sparse matrix
+ * @param A_v_c        column major vectorized initial sparse matrix
+ * @param M_v          initialized preconditioner
+ * @param g_is_update  container that indicates which blocks are active
+ * @param g_I          container of row indices
+ * @param g_J          container of column indices
+ * @param g_A_I_J      container of dense matrices -> R matrices after QR factorization
+ * @param g_bv         container of vectors beta, necessary for Q recovery
+ */
+template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
+void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                  std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<SparseVectorT> const & M_v,
+                  std::vector<cl_uint> g_is_update,
+                  std::vector<std::vector<unsigned int> > & g_I,
+                  std::vector<std::vector<unsigned int> > & g_J,
+                  block_matrix & g_A_I_J,
+                  block_vector & g_bv)
+{
+  viennacl::context ctx = viennacl::traits::context(A);
+  bool is_empty_block;
+
+  //build index set
+  index_set_up(A_v_c, M_v, g_J, g_I);
+  block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
+  block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
+}
+
+
+/***************************************************************************************************/
+/******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
+/***************************************************************************************************/
+
+/** @brief Extraction of the sparse vector m for a particular column from m_in, the contiguous vector holding all columns
+ *
+ * @param m_in          contiguous sparse vector for all columns
+ * @param start_m_ind   start index of the particular vector
+ * @param J             column index set
+ * @param m             sparse vector for the particular column
+ */
+template<typename NumericT, typename SparseVectorT>
+void custom_fan_out(std::vector<NumericT> const & m_in,
+                    unsigned int start_m_ind,
+                    std::vector<unsigned int> const & J,
+                    SparseVectorT & m)
+{
+  unsigned int  cnt = 0;
+  for (vcl_size_t i = 0; i < J.size(); ++i)
+    m[J[i]] = m_in[start_m_ind + cnt++];
+}
+
+
+
+//GPU based least square problem
+/** @brief Solution of the least squares problems on GPU
+ *
+ * @param A_v_c        column-major vectorized initial sparse matrix
+ * @param M_v          column-major vectorized sparse preconditioner matrix
+ * @param g_I          container of row set indices
+ * @param g_J          container of column set indices
+ * @param g_A_I_J_vcl  contiguous matrix that consists of blocks A(I_k, J_k)
+ * @param g_bv_vcl     contiguous vector that consists of betas, necessary for Q recovery
+ * @param g_res        container of residuals
+ * @param g_is_update  container with indicators which blocks are active
+ * @param tag          spai tag
+ * @param ctx          Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename SparseVectorT, typename NumericT>
+void least_square_solve(std::vector<SparseVectorT> & A_v_c,
+                        std::vector<SparseVectorT> & M_v,
+                        std::vector<std::vector<unsigned int> >& g_I,
+                        std::vector<std::vector<unsigned int> > & g_J,
+                        block_matrix & g_A_I_J_vcl,
+                        block_vector & g_bv_vcl,
+                        std::vector<SparseVectorT> & g_res,
+                        std::vector<cl_uint> & g_is_update,
+                        const spai_tag & tag,
+                        viennacl::context ctx)
+{
+  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+  unsigned int y_sz, m_sz;
+  std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+
+  get_size(g_I, y_sz);
+  init_start_inds(g_I, y_inds);
+  init_start_inds(g_J, m_inds);
+
+  //create y_v
+  std::vector<NumericT> y_v(y_sz, NumericT(0));
+  for (vcl_size_t i = 0; i < M_v.size(); ++i)
+  {
+    for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
+    {
+      if (g_I[i][j] == i)
+        y_v[y_inds[i] + j] = NumericT(1.0);
+    }
+  }
+  //compute m_v
+  get_size(g_J, m_sz);
+  std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));
+
+  block_vector y_v_vcl;
+  block_vector m_v_vcl;
+  //preparing memory for the least squares problem on GPU
+  y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                              static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),
+                                              &(y_v[0]));
+  m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                              static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),
+                                              &(m_v[0]));
+  y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                               static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                               &(y_inds[0]));
+  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                           &(g_is_update[0]));
+  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+  viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");
+  ls_kernel.local_work_size(0, 1);
+  ls_kernel.global_work_size(0, 256);
+  viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
+                                   y_v_vcl.handle(), y_v_vcl.handle1(),
+                                   g_A_I_J_vcl.handle1(), g_is_update_vcl,
+                                   //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
+                                   static_cast<unsigned int>(M_v.size())));
+  //copy vector m_v back from GPU to CPU
+  cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
+                                       m_v_vcl.handle().get(), CL_TRUE, 0,
+                                       sizeof(NumericT)*(m_v.size()),
+                                       &(m_v[0]), 0, NULL, NULL);
+  VIENNACL_ERR_CHECK(vcl_err);
+
+  //fan out vector in parallel
+  //#pragma omp parallel for
+  for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
+  {
+    if (g_is_update[static_cast<vcl_size_t>(i)])
+    {
+      //fanned out onto sparse vector
+      custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
+      g_res[static_cast<vcl_size_t>(i)].clear();
+      compute_spai_residual<SparseVectorT, NumericT>(A_v_c,  M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);
+      NumericT res_norm = 0;
+      //compute norm of res - just to make sure that this implementation works correctly
+      sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
+      //std::cout<<"Residual norm of column #: "<<i<<std::endl;
+      //std::cout<<res_norm<<std::endl;
+      //std::cout<<"************************"<<std::endl;
+      g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
+    }
+  }
+}
+
+//CPU based least square problems
+/** @brief Solution of the least squares problems on CPU
+ *
+ * @param A_v_c        column-major vectorized initial sparse matrix
+ * @param g_R          blocks for least square solution
+ * @param g_b_v        vectors beta, necessary for Q recovery
+ * @param g_I          container of row index set for all columns of matrix M
+ * @param g_J          container of column index set for all columns of matrix M
+ * @param g_res        container of residuals
+ * @param g_is_update  container with indicators which blocks are active
+ * @param M_v          column-major vectorized sparse matrix, final preconditioner
+ * @param tag          spai tag
+ */
+template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
+void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
+                        std::vector<DenseMatrixT> & g_R,
+                        std::vector<VectorT> & g_b_v,
+                        std::vector<std::vector<unsigned int> > & g_I,
+                        std::vector<std::vector<unsigned int> > & g_J,
+                        std::vector<SparseVectorT> & g_res,
+                        std::vector<bool> & g_is_update,
+                        std::vector<SparseVectorT> & M_v,
+                        spai_tag const & tag)
+{
+  typedef typename DenseMatrixT::value_type       NumericType;
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    if (g_is_update[i])
+    {
+      VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());
+
+      projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));
+      apply_q_trans_vec(g_R[i], g_b_v[i], y);
+
+      VectorT m_new =  boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
+      backwardSolve(g_R[i], y, m_new);
+      fanOutVector(m_new, g_J[i], M_v[i]);
+      g_res[i].clear();
+
+      compute_spai_residual<SparseVectorT, NumericType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);
+
+      NumericType res_norm = 0;
+      sparse_norm_2(g_res[i], res_norm);
+//                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
+//                    std::cout<<res_norm<<std::endl;
+//                    std::cout<<"************************"<<std::endl;
+      g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic();
+    }
+  }
+}
+
+//************************************ UPDATE CHECK ***************************************************//
+
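+// Note: despite its name, this returns true as soon as at least one block is still flagged
+// for update, i.e. it answers "is any block still being updated?".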
+template<typename VectorType>
+bool is_all_update(VectorType& parallel_is_update)
+{
+  for (unsigned int i = 0; i < parallel_is_update.size(); ++i)
+  {
+    if (parallel_is_update[i])
+      return true;
+  }
+  return false;
+}
+
+//********************************** MATRIX VECTORIZATION ***********************************************//
+
+//Matrix vectorization, column-based approach
+/** @brief Vectorization of a sparse matrix into an array of sparse column vectors
+ *
+ * @param M_in   input sparse matrix (e.g. boost::numeric::ublas::compressed_matrix)
+ * @param M_v    output array of sparse vectors, one per column
+ */
+template<typename SparseMatrixT, typename SparseVectorT>
+void vectorize_column_matrix(SparseMatrixT const & M_in,
+                             std::vector<SparseVectorT> & M_v)
+{
+  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
+    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+        M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
+}
+
+//Matrix vectorization, row-based approach
+template<typename SparseMatrixT, typename SparseVectorT>
+void vectorize_row_matrix(SparseMatrixT const & M_in,
+                          std::vector<SparseVectorT> & M_v)
+{
+  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
+    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
+}
+
+//************************************* BLOCK ASSEMBLY CODE *********************************************//
+
+
+template<typename SizeT>
+void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
+                        std::vector<cl_uint> & a)
+{
+  vcl_size_t cnt = 0;
+
+  for (vcl_size_t i = 0; i < ind_set.size(); ++i)
+    for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
+      a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
+}
+
+
+
+//assembling blocks on GPU
+/** @brief Assembly of blocks on the GPU for a given set of row indices g_I and column indices g_J
+ *
+ * @param A               initial sparse matrix
+ * @param g_J             container of column index sets
+ * @param g_I             container of row index sets
+ * @param g_A_I_J_vcl     contiguous blocks A(I, J) in GPU memory
+ * @param g_is_update     container with indicators which blocks are active
+ * @param is_empty_block  set to true if no blocks were assembled
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                    std::vector<std::vector<unsigned int> > const & g_J,
+                    std::vector<std::vector<unsigned int> > const & g_I,
+                    block_matrix & g_A_I_J_vcl,
+                    std::vector<cl_uint> & g_is_update,
+                    bool & is_empty_block)
+{
+  //computing start indices for index sets and start indices for block matrices
+  unsigned int sz_I, sz_J, sz_blocks;
+  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+  std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  //
+  init_start_inds(g_J, j_ind);
+  init_start_inds(g_I, i_ind);
+  //
+  get_size(g_J, sz_J);
+  get_size(g_I, sz_I);
+  std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
+  //
+  std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
+
+  // computing size for blocks
+  // writing set to arrays
+  write_set_to_array(g_I, I_set);
+  write_set_to_array(g_J, J_set);
+
+  // if there are blocks to assemble
+  if (I_set.size() > 0 && J_set.size() > 0)
+  {
+    viennacl::context ctx = viennacl::traits::context(A);
+    viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+    compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+    std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
+
+    block_vector set_I_vcl, set_J_vcl;
+    //init memory on GPU
+    //contiguous g_A_I_J
+    g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                    static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)),
+                                                    &(con_A_I_J[0]));
+    g_A_I_J_vcl.handle().context(opencl_ctx);
+
+    //matrix_dimensions
+    g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
+                                                     &(matrix_dims[0]));
+    g_A_I_J_vcl.handle1().context(opencl_ctx);
+
+    //start_block inds
+    g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                     &(blocks_ind[0]));
+    g_A_I_J_vcl.handle2().context(opencl_ctx);
+
+    //set_I
+    set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                  static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
+                                                  &(I_set[0]));
+    set_I_vcl.handle().context(opencl_ctx);
+
+    //set_J
+    set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                  static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
+                                                  &(J_set[0]));
+    set_J_vcl.handle().context(opencl_ctx);
+
+    //i_ind
+    set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                   &(i_ind[0]));
+    set_I_vcl.handle1().context(opencl_ctx);
+
+    //j_ind
+    set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                   &(j_ind[0]));
+    set_J_vcl.handle1().context(opencl_ctx);
+
+    viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                             static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+                                                                             &(g_is_update[0]));
+
+    viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+    viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks");
+    assembly_kernel.local_work_size(0, 1);
+    assembly_kernel.global_work_size(0, 256);
+    viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                                           set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
+                                           set_J_vcl.handle1(),
+                                           g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
+                                           g_is_update_vcl,
+                                           static_cast<unsigned int>(g_I.size())));
+    is_empty_block = false;
+  }
+  else
+    is_empty_block = true;
+}
+
+/************************************************************************************************************************/
+
+/** @brief Insertion of the vectorized matrix columns into the original sparse matrix
+ *
+ * @param M_v       column-major vectorized matrix
+ * @param M         original sparse matrix
+ * @param is_right  indicates whether the matrix should be transposed in the output
+ */
+template<typename SparseMatrixT, typename SparseVectorT>
+void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
+                           SparseMatrixT& M,
+                           bool is_right)
+{
+  if (is_right)
+  {
+    for (unsigned int i = 0; i < M_v.size(); ++i)
+      for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
+        M(vec_it->first, i) = vec_it->second;
+  }
+  else  //transposed fill of M
+  {
+    for (unsigned int i = 0; i < M_v.size(); ++i)
+      for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
+        M(i, vec_it->first) = vec_it->second;
+  }
+}
+
+/** @brief Transposition of a sparse matrix
+ *
+ * @param A_in   initial sparse matrix
+ * @param A      output transposed matrix
+ */
+template<typename MatrixT>
+void sparse_transpose(MatrixT const & A_in, MatrixT & A)
+{
+  typedef typename MatrixT::value_type         NumericType;
+
+  std::vector<std::map<vcl_size_t, NumericType> >   temp_A(A_in.size2());
+  A.resize(A_in.size2(), A_in.size1(), false);
+
+  for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
+       row_it != A_in.end1();
+       ++row_it)
+  {
+    for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+         col_it != row_it.end();
+         ++col_it)
+    {
+      temp_A[col_it.index2()][col_it.index1()] = *col_it;
+    }
+  }
+
+  for (vcl_size_t i=0; i<temp_A.size(); ++i)
+  {
+    for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();
+         it != temp_A[i].end();
+         ++it)
+      A(i, it->first) = it->second;
+  }
+}
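+
+// A minimal usage sketch for sparse_transpose (hypothetical example; assumes a filled
+// boost::numeric::ublas::compressed_matrix<double> named A_in):
+//
+//   boost::numeric::ublas::compressed_matrix<double> A_trans;
+//   viennacl::linalg::detail::spai::sparse_transpose(A_in, A_trans);
+//   // every stored entry then satisfies A_trans(j, i) == A_in(i, j)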
+
+
+
+
+//        template<typename SparseVectorType>
+//        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
+//            for (int i = 0; i < l_M_v.size(); ++i){
+//                l_M_v[i] = M_v[i + beg_ind];
+//            }
+//        }
+
+//CPU version
+/** @brief Construction of SPAI preconditioner on CPU
+ *
+ * @param A     initial sparse matrix
+ * @param M     output preconditioner
+ * @param tag   spai tag
+ */
+template<typename MatrixT>
+void computeSPAI(MatrixT const & A,
+                 MatrixT & M,
+                 spai_tag & tag)
+{
+  typedef typename MatrixT::value_type                                       NumericT;
+  typedef typename boost::numeric::ublas::vector<NumericT>                   VectorType;
+  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>   SparseVectorType;
+  typedef typename boost::numeric::ublas::matrix<NumericT>                   DenseMatrixType;
+
+  //sparse matrix transpose...
+  unsigned int cur_iter = 0;
+  tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
+  bool go_on = true;
+  std::vector<SparseVectorType> A_v_c(M.size2());
+  std::vector<SparseVectorType> M_v(M.size2());
+  vectorize_column_matrix(A, A_v_c);
+  vectorize_column_matrix(M, M_v);
+
+
+  while (go_on)
+  {
+    go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
+    cur_iter = 0;
+    unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
+    //std::vector<bool> g_is_update(M.size2(), true);
+    std::vector<bool> g_is_update(l_sz, true);
+
+    //init is update
+    //init_parallel_is_update(g_is_update);
+    //std::vector<SparseVectorType> A_v_c(K);
+    //std::vector<SparseVectorType> M_v(K);
+    //vectorization of marices
+    //print_matrix(M_v);
+
+    std::vector<SparseVectorType> l_M_v(l_sz);
+    //custom_copy(M_v, l_M_v, beg_ind);
+    std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
+
+    //print_matrix(l_M_v);
+    //std::vector<SparseVectorType> l_A_v_c(K);
+    //custom_copy(A_v_c, l_A_v_c, beg_ind);
+    //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
+    //print_matrix(l_A_v_c);
+    //vectorize_row_matrix(A, A_v_r);
+    //working blocks
+
+    std::vector<DenseMatrixType> g_A_I_J(l_sz);
+    std::vector<VectorType> g_b_v(l_sz);
+    std::vector<SparseVectorType> g_res(l_sz);
+    std::vector<std::vector<unsigned int> > g_I(l_sz);
+    std::vector<std::vector<unsigned int> > g_J(l_sz);
+
+    while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
+    {
+      // SET UP THE BLOCKS..
+      // PHASE ONE
+      if (cur_iter == 0)
+        block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
+      else
+        block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
+
+      //PHASE TWO, LEAST SQUARE SOLUTION
+      least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
+
+      if (tag.getIsStatic()) break;
+      cur_iter++;
+    }
+
+    std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
+    tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
+    tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
+    //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
+  }
+
+  M.resize(M.size1(), M.size2(), false);
+  insert_sparse_columns(M_v, M, tag.getIsRight());
+}
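+
+// A minimal usage sketch of the CPU path (hypothetical example; assumes a filled
+// boost::numeric::ublas::compressed_matrix<double> named A, and that M is seeded with an
+// initial sparsity pattern, for instance that of A):
+//
+//   boost::numeric::ublas::compressed_matrix<double> M(A);
+//   viennacl::linalg::detail::spai::spai_tag tag(1e-3, 5, 1e-2, false, false);
+//   viennacl::linalg::detail::spai::computeSPAI(A, M, tag);
+//   // M now approximates the inverse of A and is used by the SPAI preconditioner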
+
+
+//GPU-based version
+/** @brief Construction of SPAI preconditioner on GPU
+ *
+ * @param A      initial sparse matrix
+ * @param cpu_A  copy of initial matrix on CPU
+ * @param cpu_M  output preconditioner on CPU
+ * @param M      output preconditioner
+ * @param tag    SPAI tag class with parameters
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input
+                 boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,
+                 boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output
+                 viennacl::compressed_matrix<NumericT, AlignmentV> & M,
+                 spai_tag const & tag)
+{
+  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>        SparseVectorType;
+
+  //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
+  //sparse matrix transpose...
+  unsigned int cur_iter = 0;
+  std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
+  //init is update
+  //init_parallel_is_update(g_is_update);
+  std::vector<SparseVectorType> A_v_c(cpu_M.size2());
+  std::vector<SparseVectorType> M_v(cpu_M.size2());
+  vectorize_column_matrix(cpu_A, A_v_c);
+  vectorize_column_matrix(cpu_M, M_v);
+  std::vector<SparseVectorType> g_res(cpu_M.size2());
+  std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
+  std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
+
+  //OpenCL variables
+  block_matrix g_A_I_J_vcl;
+  block_vector g_bv_vcl;
+  while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
+  {
+    // SET UP THE BLOCKS..
+    // PHASE ONE..
+    //timer.start();
+    //index set up on CPU
+    if (cur_iter == 0)
+      block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
+    else
+      block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
+    //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
+    //PERFORM LEAST-SQUARES SOLUTION
+    //PHASE TWO
+    //timer.start();
+    least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
+    //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
+    if (tag.getIsStatic())
+      break;
+    cur_iter++;
+  }
+
+  cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
+  insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
+  //copy back to GPU
+  M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
+  viennacl::copy(cpu_M, M);
+}
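+
+// A minimal usage sketch of the GPU path (hypothetical example; assumes consistent CPU and
+// GPU copies of the system matrix and an initial pattern for the preconditioner):
+//
+//   boost::numeric::ublas::compressed_matrix<double> cpu_A /* filled */, cpu_M(cpu_A);
+//   viennacl::compressed_matrix<double> A, M;
+//   viennacl::copy(cpu_A, A);
+//   viennacl::linalg::detail::spai::spai_tag tag;
+//   viennacl::linalg::detail::spai::computeSPAI(A, cpu_A, cpu_M, M, tag);
+//   // both cpu_M and M now hold the sparse approximate inverse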
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
new file mode 100644
index 0000000..d8c718c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
@@ -0,0 +1,143 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/spai/spai_tag.hpp
+    @brief Implementation of the spai tag holding SPAI configuration parameters. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include "viennacl/ocl/backend.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief A tag for SPAI
+ *
+ * Contains values for the algorithm.
+ * Must be passed to spai_precond constructor
+ */
+class spai_tag
+{
+  /** @brief Constructor
+   *
+   * @param residual_norm_threshold   Calculate until the norm of the residual falls below this threshold
+   * @param iteration_limit           maximum number of iterations
+   * @param residual_threshold        determines the starting threshold in the residual vector for including new indices into the set J
+   * @param is_static                 determines whether the static version of SPAI should be used
+   * @param is_right                  determines whether a left or a right preconditioner should be used
+   */
+public:
+  spai_tag(double residual_norm_threshold = 1e-3,
+           unsigned int iteration_limit = 5,
+           double residual_threshold = 1e-2,
+           bool is_static = false,
+           bool is_right = false)
+    : residual_norm_threshold_(residual_norm_threshold),
+      iteration_limit_(iteration_limit),
+      residual_threshold_(residual_threshold),
+      is_static_(is_static),
+      is_right_(is_right) {}
+
+  double getResidualNormThreshold() const { return residual_norm_threshold_; }
+
+  double getResidualThreshold() const { return residual_threshold_; }
+
+  unsigned int getIterationLimit () const { return iteration_limit_; }
+
+  bool getIsStatic() const { return is_static_; }
+
+  bool getIsRight() const { return is_right_; }
+
+  long getBegInd() const { return beg_ind_; }
+
+  long getEndInd() const { return end_ind_; }
+
+
+
+  void setResidualNormThreshold(double residual_norm_threshold)
+  {
+    if (residual_norm_threshold > 0)
+      residual_norm_threshold_ = residual_norm_threshold;
+  }
+
+  void setResidualThreshold(double residual_threshold)
+  {
+    if (residual_threshold > 0)
+      residual_threshold_ = residual_threshold;
+  }
+
+  void setIterationLimit(unsigned int iteration_limit)
+  {
+    if (iteration_limit > 0)
+      iteration_limit_ = iteration_limit;
+  }
+
+  void setIsRight(bool is_right) { is_right_ = is_right; }
+
+  void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+  void setBegInd(long beg_ind) { beg_ind_ = beg_ind; }
+
+  void setEndInd(long end_ind){ end_ind_ = end_ind; }
+
+
+private:
+  double        residual_norm_threshold_;
+  unsigned int  iteration_limit_;
+  long          beg_ind_;
+  long          end_ind_;
+  double        residual_threshold_;
+  bool          is_static_;
+  bool          is_right_;
+};
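+
+// A minimal usage sketch (hedged example; the tag is typically handed to the spai_precond
+// constructor mentioned above):
+//
+//   viennacl::linalg::detail::spai::spai_tag tag(1e-3,   // residual norm threshold
+//                                                5,      // iteration limit
+//                                                1e-2,   // residual threshold for new indices
+//                                                false,  // dynamic pattern updates
+//                                                false); // left preconditioner
+//   tag.setIterationLimit(10);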
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
new file mode 100644
index 0000000..c99eda1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/sparse_vector.hpp
+    @brief Implementation of a helper sparse vector class for SPAI. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+ * @brief Represents a sparse vector based on std::map<unsigned int, NumericT>
+ */
+template<typename NumericT>
+class sparse_vector
+{
+public:
+  typedef typename std::map<unsigned int, NumericT>::iterator        iterator;
+  typedef typename std::map<unsigned int, NumericT>::const_iterator  const_iterator;
+
+  sparse_vector() {}
+
+  /** @brief Accesses the entry at index 'ind', inserting a zero-initialized entry if it does not yet exist */
+  NumericT & operator[] (unsigned int ind) { return v_[ind]; }
+
+  void clear() { v_.clear(); }
+
+  const_iterator find(unsigned int var) const { return v_.find(var); }
+        iterator find(unsigned int var)       { return v_.find(var); }
+
+  const_iterator begin() const { return v_.begin(); }
+        iterator begin()       { return v_.begin(); }
+  const_iterator end() const { return v_.end(); }
+        iterator end()       { return v_.end(); }
+
+private:
+  unsigned int                      size_;
+  std::map<unsigned int, NumericT>  v_;
+};
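+
+// A minimal usage sketch of the map-backed interface above (assumes <iostream> is available):
+//
+//   viennacl::linalg::detail::spai::sparse_vector<double> v;
+//   v[3] = 1.5;    // operator[] inserts the entry on first access
+//   v[7] = -2.0;
+//   for (viennacl::linalg::detail::spai::sparse_vector<double>::const_iterator it = v.begin();
+//        it != v.end(); ++it)
+//     std::cout << it->first << " -> " << it->second << std::endl;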
+
+}
+}
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
new file mode 100644
index 0000000..a3340d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
@@ -0,0 +1,580 @@
+#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/direct_solve.hpp
+    @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/host_based/direct_solve.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/direct_solve.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/direct_solve.hpp"
+#endif
+
+#define VIENNACL_DIRECT_SOLVE_BLOCKSIZE 128
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+
+  //
+  // A \ B:
+  //
+
+  /** @brief Direct inplace solver for dense triangular systems using a single kernel launch. Matlab notation: A \ B
+  *
+  * @param A    The system matrix
+  * @param B    The matrix of row vectors, where the solution is directly written to
+  */
+  template<typename NumericT, typename SolverTagT>
+  void inplace_solve_kernel(const matrix_base<NumericT>  & A, const matrix_base<NumericT> & B, SolverTagT)
+  {
+    assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+    assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && bool("Size check failed in inplace_solve(): size1(A) != size1(B)"));
+    switch (viennacl::traits::handle(A).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  //
+  // A \ b
+  //
+
+  template<typename NumericT, typename SolverTagT>
+  void inplace_solve_vec_kernel(const matrix_base<NumericT> & mat,
+                                const vector_base<NumericT> & vec,
+                                SolverTagT)
+  {
+    assert( (mat.size1() == vec.size()) && bool("Size check failed in inplace_solve(): size1(A) != size(b)"));
+    assert( (mat.size2() == vec.size()) && bool("Size check failed in inplace_solve(): size2(A) != size(b)"));
+
+    switch (viennacl::traits::handle(mat).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+  void inplace_solve_lower_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  NumericType;
+
+    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (A.size1() <= blockSize)
+      inplace_solve_kernel(A, B, SolverTagT());
+    else
+    {
+      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
+      {
+        vcl_size_t Apos1 = i;
+        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
+        vcl_size_t Bpos = B.size2();
+        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+                             viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0,     Bpos)),
+                             SolverTagT());
+        if (Apos2 < A.size1())
+        {
+          viennacl::matrix_range<MatrixT2> B_lower(B, viennacl::range(Apos2, B.size1()), viennacl::range(0, Bpos));
+          viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
+                                      viennacl::project(B, viennacl::range(Apos1, Apos2),     viennacl::range(0,     Bpos)),
+                                      B_lower,
+                                      NumericType(-1.0), NumericType(1.0));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::lower_tag)
+  {
+    inplace_solve_lower_impl(A, B, viennacl::linalg::lower_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_lower_tag)
+  {
+    inplace_solve_lower_impl(A, B, viennacl::linalg::unit_lower_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+  void inplace_solve_upper_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  NumericType;
+
+    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (static_cast<int>(A.size1()) <= blockSize)
+      inplace_solve_kernel(A, B, SolverTagT());
+    else
+    {
+      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
+      {
+        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
+        vcl_size_t Apos2 = vcl_size_t(i);
+        vcl_size_t Bpos = B.size2();
+        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+                             viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)),
+                             SolverTagT());
+        if (Apos1 > 0)
+        {
+          viennacl::matrix_range<MatrixT2> B_upper(B, viennacl::range(0, Apos1), viennacl::range(0, Bpos));
+
+          viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(0,     Apos1), viennacl::range(Apos1, Apos2)),
+                                      viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0,     Bpos)),
+                                      B_upper,
+                                      NumericType(-1.0), NumericType(1.0));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::upper_tag)
+  {
+    inplace_solve_upper_impl(A, B, viennacl::linalg::upper_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_upper_tag)
+  {
+    inplace_solve_upper_impl(A, B, viennacl::linalg::unit_upper_tag());
+  }
+
+} // namespace detail
+
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
+*
+* @param A      The system matrix
+* @param B      The matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+/** @brief Direct inplace solver for triangular systems with multiple transposed right hand sides, i.e. A \ B^T   (MATLAB notation)
+*
+* @param A       The system matrix
+* @param proxy_B The proxy for the transposed matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+                   matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> proxy_B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+                          !proxy_B.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+//upper triangular solver for transposed lower triangular matrices
+/** @brief Direct inplace solver for transposed triangular systems with multiple right hand sides, i.e. A^T \ B   (MATLAB notation)
+*
+* @param proxy_A  The transposed system matrix proxy
+* @param B        The matrix holding the load vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>  & proxy_A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+                          !proxy_A.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+/** @brief Direct inplace solver for transposed triangular systems with multiple transposed right hand sides, i.e. A^T \ B^T   (MATLAB notation)
+*
+* @param proxy_A    The transposed system matrix proxy
+* @param proxy_B    The transposed matrix holding the load vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy_A,
+                   matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>         proxy_B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+                          !proxy_A.lhs().row_major());
+
+  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+                          !proxy_B.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+
+/////////////////// general wrappers for non-inplace solution //////////////////////
+
+
+/** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A    The system matrix
+* @param B    The matrix of load vectors
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+                            const matrix_base<NumericT> & B,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(B);
+  inplace_solve(A, result, tag);
+  return result;
+}
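+
+// A minimal usage sketch (hypothetical sizes; only the triangle selected by the tag is read from A):
+//
+//   viennacl::matrix<double> A(128, 128), B(128, 16), X(128, 16);
+//   /* ... fill A and B ... */
+//   X = viennacl::linalg::solve(A, B, viennacl::linalg::upper_tag());
+//   // lower_tag(), unit_lower_tag() and unit_upper_tag() work analogously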
+
+/** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A    The system matrix
+* @param proxy  The transposed load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+                            const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(proxy);
+  inplace_solve(A, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for result = solve(trans(mat), B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param proxy  The transposed system matrix proxy
+* @param B      The matrix of load vectors
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+                            const matrix_base<NumericT> & B,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(B);
+  inplace_solve(proxy, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for result = solve(trans(A), trans(B), some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param proxy_A  The transposed system matrix proxy
+* @param proxy_B  The transposed matrix of load vectors
+* @param tag      Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A,
+                            const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_B,
+                            SolverTagT tag)
+{
+  // run an inplace solve on the result vector:
+  matrix_base<NumericT> result(proxy_B);
+  inplace_solve(proxy_A, result, tag);
+  return result;
+}
+
+//
+/////////// solves with vector as right hand side ///////////////////
+//
+
+namespace detail
+{
+  template<typename MatrixT1, typename VectorT, typename SolverTagT>
+  void inplace_solve_lower_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
+  {
+    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (A.size1() <= blockSize)
+      inplace_solve_vec_kernel(A, b, SolverTagT());
+    else
+    {
+      VectorT temp(b);
+      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
+      {
+        vcl_size_t Apos1 = i;
+        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
+        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+                                 viennacl::project(b, viennacl::range(Apos1, Apos2)),
+                                 SolverTagT());
+        if (Apos2 < A.size1())
+        {
+          viennacl::project(temp, viennacl::range(Apos2, A.size1())) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
+                                                                                              viennacl::project(b, viennacl::range(Apos1, Apos2)));
+          viennacl::project(b, viennacl::range(Apos2, A.size1())) -= viennacl::project(temp, viennacl::range(Apos2, A.size1()));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::lower_tag)
+  {
+    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::lower_tag());
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::unit_lower_tag)
+  {
+    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::unit_lower_tag());
+  }
+
+  template<typename MatrixT1, typename VectorT, typename SolverTagT>
+  void inplace_solve_upper_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
+  {
+    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (static_cast<int>(A.size1()) <= blockSize)
+      inplace_solve_vec_kernel(A, b, SolverTagT());
+    else
+    {
+      VectorT temp(b);
+      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
+      {
+        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
+        vcl_size_t Apos2 = vcl_size_t(i);
+        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+                                 viennacl::project(b, viennacl::range(Apos1, Apos2)),
+                                 SolverTagT());
+        if (Apos1 > 0)
+        {
+          viennacl::project(temp, viennacl::range(0, Apos1)) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(0,     Apos1), viennacl::range(Apos1, Apos2)),
+                                                                                      viennacl::project(b, viennacl::range(Apos1, Apos2)));
+          viennacl::project(b, viennacl::range(0, Apos1)) -= viennacl::project(temp, viennacl::range(0, Apos1));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::upper_tag)
+  {
+    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::upper_tag());
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::unit_upper_tag)
+  {
+    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::unit_upper_tag());
+  }
+
+} // namespace detail
+
+/** @brief Inplace solution of a triangular system. Matlab notation A \ b.
+*
+* @param mat    The system matrix (a dense matrix for which only the respective triangular form is used)
+* @param vec    The right hand side vector
+* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & mat,
+                   vector_base<NumericT> & vec,
+                   SolverTagT const & tag)
+{
+
+  detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+/** @brief Inplace solution of a triangular system with transposed system matrix. Matlab notation: A' \ b.
+*
+* @param proxy  The transposed system matrix (a dense matrix for which only the respective triangular form is used)
+* @param vec    The right hand side vector
+* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   SolverTagT const & tag)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  // wrap existing matrix in a new matrix_base object (no data copy)
+  matrix_base<NumericT> mat(const_cast<handle_type &>(proxy.lhs().handle()),
+                            proxy.lhs().size2(), proxy.lhs().start2(), proxy.lhs().stride2(), proxy.lhs().internal_size2(),
+                            proxy.lhs().size1(), proxy.lhs().start1(), proxy.lhs().stride1(), proxy.lhs().internal_size1(),
+                            !proxy.lhs().row_major());
+  detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for an upper triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::upper_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
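+
+// A minimal usage sketch (hypothetical sizes; only the upper triangle of 'mat' is read):
+//
+//   viennacl::matrix<double> mat(64, 64);
+//   viennacl::vector<double> rhs(64), x(64);
+//   /* ... fill mat and rhs ... */
+//   x = viennacl::linalg::solve(mat, rhs, viennacl::linalg::upper_tag());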
+
+/** @brief Convenience function for result = solve(mat, vec, unit_upper_tag()); for an upper triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::unit_upper_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, lower_tag()); for a lower triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::lower_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, unit_lower_tag()); for a lower triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::unit_lower_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param proxy  The transposed system matrix proxy
+* @param vec    The load vector, where the solution is directly written to
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+                       const vector_base<NumericT> & vec,
+                       SolverTagT const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(proxy, result, tag);
+  return result;
+}
+
+
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
new file mode 100644
index 0000000..36be3b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
@@ -0,0 +1,29 @@
+#ifndef VIENNACL_LINALG_EIG_HPP_
+#define VIENNACL_LINALG_EIG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/eig.hpp
+*   @brief Convenience header file including all available eigenvalue algorithms
+*/
+
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/linalg/power_iter.hpp"
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
new file mode 100644
index 0000000..ae9ade2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
@@ -0,0 +1,481 @@
+#ifndef VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+   Institute for Analysis and Scientific Computing,
+   TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/**  @file viennacl/linalg/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform.
+ */
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/opencl/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/fft_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/**
+ * @brief Direct 1D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out, vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::direct(in, out, size, stride, batch_num,sign,data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
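+
+// A minimal usage sketch (assumption: complex values are stored as interleaved (real, imag)
+// pairs, so a vector of 2*N scalars holds N complex entries; a stride equal to the transform
+// size covers the common single-batch case):
+//
+//   vcl_size_t N = 100;                      // arbitrary size, no power-of-two restriction
+//   viennacl::vector<float> in(2 * N), out(2 * N);
+//   /* ... fill 'in' with interleaved complex data ... */
+//   viennacl::linalg::direct(in, out, N, N, 1);   // one batch, forward transform (sign = -1)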
+
+/**
+ * @brief Direct 2D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& out, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::direct(in, out, size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
+
+/*
+ * Reorders the input data so that indices appear in bit-reversal order.
+ * Such reordering should be done before an in-place radix-2 FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+             vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::reorder(in, size, stride, bits_datasize, batch_num, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::reorder<NumericT>(viennacl::traits::opencl_handle(in), size, stride, bits_datasize, batch_num, data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::reorder(in, size, stride, bits_datasize, batch_num, data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
+
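[For reference, an editorial sketch that is not part of the patch: "bit-reversal order" means that, for a transform of length 2^bits, the element at index i ends up at the index whose binary digits are those of i reversed. A plain host-side helper (not a ViennaCL API) illustrating that permutation:]

  #include <cstddef>

  // Reverse the lowest 'bits' bits of v, e.g. bits = 3: index 1 (001b) maps to 4 (100b).
  inline std::size_t bit_reverse(std::size_t v, std::size_t bits)
  {
    std::size_t r = 0;
    for (std::size_t i = 0; i < bits; ++i)
    {
      r = (r << 1) | (v & 1);  // shift the next lowest bit of v into r
      v >>= 1;
    }
    return r;
  }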
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
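[A minimal in-place usage sketch, editorial and not part of the patch: for a power-of-two number of complex values n, the vector overload above transforms the interleaved data in place. As with direct(), stride = n and batch_num = 1 are assumptions for a single contiguous transform.]

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/fft_operations.hpp"

  // Sketch: in-place radix-2 FFT of n complex values, n a power of two.
  void example_radix2_fft(viennacl::vector<float> & data, vcl_size_t n)
  {
    viennacl::linalg::radix2(data, n, n, 1, -1.0f);  // sign = -1 (the default): forward transform
  }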
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses a significant amount of additional memory, but should be fast for any data size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::bluestein(in, out, 1);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::bluestein(in, out, 1);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::bluestein(in, out, 1);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
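[Editorial note with a small sketch, not part of the patch: Bluestein's trick rewrites a DFT of arbitrary length n as a convolution with a chirp sequence, and that convolution can be zero-padded to a power-of-two length, which is why it handles non-power-of-two sizes at the cost of extra memory. A call for such a size might look as follows; note that the dispatcher above ignores its batch argument and always passes 1.]

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/fft_operations.hpp"

  // Sketch: transform of n complex values where n need not be a power of two (n < 2^16).
  void example_bluestein(viennacl::vector<float> & in,   // interleaved complex input
                         viennacl::vector<float> & out)  // interleaved complex output
  {
    viennacl::linalg::bluestein(in, out, 1);  // third argument: batch count (unused here)
  }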
+/**
+ * @brief Multiply two complex vectors element-wise and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV>       & output)
+{
+  switch (viennacl::traits::handle(input1).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::multiply_complex(input1, input2, output);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::multiply_complex(input1, input2, output);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::multiply_complex(input1, input2, output);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
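[An editorial host-side sketch of what the element-wise product computes on the interleaved layout, using the identity (a + bi)(c + di) = (ac - bd) + (ad + bc)i; the helper name is ours, not a ViennaCL function.]

  #include <vector>
  #include <cstddef>

  // Sketch: element-wise complex product on interleaved (re, im) storage.
  void multiply_complex_host(std::vector<float> const & in1,
                             std::vector<float> const & in2,
                             std::vector<float>       & out)
  {
    for (std::size_t k = 0; k < out.size() / 2; ++k)
    {
      float a = in1[2*k], b = in1[2*k + 1];
      float c = in2[2*k], d = in2[2*k + 1];
      out[2*k]     = a*c - b*d;  // real part
      out[2*k + 1] = a*d + b*c;  // imaginary part
    }
  }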
+/**
+ * @brief Normalize a vector by its own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::normalize(input);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::normalize(input);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::normalize(input);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
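[A host-side sketch of the same scaling, editorial and assumption-laden: it assumes, in line with the doc comment above, that "size" means the number of complex (re, im) pairs and that every entry is divided by it, the usual scaling after an inverse transform.]

  #include <vector>
  #include <cstddef>

  // Sketch (assumed semantics): divide each entry of an interleaved complex vector
  // by the number of complex elements it holds.
  void normalize_host(std::vector<float> & data)
  {
    float const factor = static_cast<float>(data.size() / 2);
    for (std::size_t i = 0; i < data.size(); ++i)
      data[i] /= factor;
  }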
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::transpose(input);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::transpose(input);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::transpose(input);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Out-of-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & output)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::transpose(input, output);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::transpose(input, output);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::transpose(input, output);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Create a complex vector from a real vector (even elements (2*k) hold the real parts, odd elements (2*k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::real_to_complex(in, out, size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::real_to_complex(in, out, size);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::real_to_complex(in, out, size);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Create a real vector from a complex vector (even elements (2*k) hold the real parts, odd elements (2*k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::complex_to_real(in, out, size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::complex_to_real(in, out, size);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::complex_to_real(in, out, size);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
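[An editorial sketch of the interleaved layout behind real_to_complex() and complex_to_real(): packing writes each real value into an even slot, and unpacking keeps only the even slots. The zero imaginary part on packing is an assumption consistent with the even/odd convention documented above; the helper names are ours.]

  #include <vector>
  #include <cstddef>

  // Sketch: pack n real values into 2*n interleaved entries (imaginary parts assumed zero).
  void real_to_complex_host(std::vector<float> const & in, std::vector<float> & out, std::size_t n)
  {
    out.resize(2 * n);
    for (std::size_t k = 0; k < n; ++k)
    {
      out[2*k]     = in[k];  // real part
      out[2*k + 1] = 0.0f;   // imaginary part
    }
  }

  // Sketch: keep only the real parts (even slots) of 2*n interleaved entries.
  void complex_to_real_host(std::vector<float> const & in, std::vector<float> & out, std::size_t n)
  {
    out.resize(n);
    for (std::size_t k = 0; k < n; ++k)
      out[k] = in[2*k];
  }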
+/**
+ * @brief Reverse the order of the vector's elements in place
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::reverse(in);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::reverse(in);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::reverse(in);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+}
+}
+
+#endif /* VIENNACL_LINALG_FFT_OPERATIONS_HPP_ */


[33/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
new file mode 100644
index 0000000..24cb4a6
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
@@ -0,0 +1,2887 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_prod.hpp
+    @brief Dense matrix-matrix product CUDA kernels reside here.
+
+    Note: File created semi-automatically from OpenCL kernels.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];  // 16 x 17: a 16x16 tile plus one padding column to avoid shared-memory bank conflicts
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
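[An editorial launch sketch, not part of the patch: the kernel above hard-codes block_size = 16 and pads its shared-memory tiles to 16 x 17, so the host side is assumed to launch it with 16 x 16 thread blocks and one block per 16 x 16 tile of C. The helper below is hypothetical (launch_prod_AA is not a ViennaCL function), uses zero offsets and unit strides, i.e. full column-major operands, and passes the padded leading dimensions as the "internal" sizes; it would live in a .cu file compiled by nvcc together with the kernel.]

  // Sketch of a plausible launch for full (non-sub-range) column-major operands:
  // C (M x N) = alpha * A (M x K) * B (K x N) + beta * C.
  void launch_prod_AA(float alpha,
                      const float * A, unsigned int M, unsigned int K,
                      unsigned int A_internal_rows, unsigned int A_internal_cols,
                      const float * B, unsigned int N,
                      unsigned int B_internal_rows, unsigned int B_internal_cols,
                      float beta,
                      float * C, unsigned int C_internal_rows, unsigned int C_internal_cols)
  {
    dim3 threads(16, 16);                      // must match the hard-coded block_size = 16
    dim3 grid((M + 15) / 16, (N + 15) / 16);   // one 16 x 16 tile of C per block
    matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>(
        alpha,
        A, 0, 0, 1, 1, M, K, A_internal_rows, A_internal_cols,   // A: starts, increments, sizes, padded dims
        B, 0, 0, 1, 1, K, N, B_internal_rows, B_internal_cols,   // B likewise
        beta,
        C, 0, 0, 1, 1, M, N, C_internal_rows, C_internal_cols);  // C likewise
  }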
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_TA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_TT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_TA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_TT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_AA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_AT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16;//get_local_size(0);
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+  vcl_size_t bStep = block_size * B_col_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_TA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_TT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+  vcl_size_t bStep = block_size * B_col_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_AA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
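+// Illustrative stand-alone launch of the kernel above (hypothetical helper, not
+// part of ViennaCL; the library's actual dispatch derives all of these arguments
+// from matrix proxies elsewhere in the CUDA backend). Assumptions: A is M x K,
+// column-major with leading dimension lda >= M; B is K x N, row-major with
+// leading dimension ldb >= N; C is M x N, row-major with leading dimension
+// ldc >= N. The 16x16 thread block matches the tiling hard-coded in the kernels,
+// and the boundary guards make sizes that are not multiples of 16 safe.
+template<typename NumericT>
+void example_launch_row_col_row_prod_AA(NumericT alpha,
+                                        const NumericT *A, unsigned int lda,
+                                        const NumericT *B, unsigned int ldb,
+                                        NumericT beta,
+                                        NumericT *C, unsigned int ldc,
+                                        unsigned int M, unsigned int K, unsigned int N)
+{
+  dim3 threads(16, 16);
+  dim3 grid((M + 15) / 16, (N + 15) / 16);   // one C entry per thread
+  matrix_matrix_row_col_row_prod_AA_kernel<NumericT><<<grid, threads>>>(
+      alpha,
+      A, 0, 0, 1, 1, M, K, lda, K,           // starts, increments, sizes, internal sizes
+      B, 0, 0, 1, 1, K, N, K, ldb,
+      beta,
+      C, 0, 0, 1, 1, M, N, M, ldc);
+}
+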
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_AT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+  vcl_size_t bStep = block_size * B_col_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_TA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_TT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+  vcl_size_t aStep = block_size * A_row_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+  vcl_size_t bStep = block_size * B_col_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_AA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+  vcl_size_t aStep = block_size * A_col_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_AT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+  vcl_size_t aStep = block_size * A_col_inc;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_TA_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+  vcl_size_t bStep = block_size * B_row_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+  NumericT Csub = 0;
+  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+  for (vcl_size_t block = 0;
+          block < block_num;
+          ++block)
+  {
+    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+    __syncthreads();
+    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+    __syncthreads();
+    aBegin += aStep;
+    bBegin += bStep;
+  }
+  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_TT_kernel(
+          NumericT alpha,
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_inc,
+          unsigned int B_col_inc,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          NumericT beta,
+          NumericT * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_inc,
+          unsigned int C_col_inc,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols)
+{
+
+  __shared__ NumericT bufA[272];
+  __shared__ NumericT bufB[272];
+
+  vcl_size_t block_size = 16; // fixed tile size (get_local_size(0) in the OpenCL variant of this kernel)
+  vcl_size_t row_block_id = blockIdx.x;
+  vcl_size_t col_block_id = blockIdx.y;
+  vcl_size_t row_thread_id = threadIdx.x;
+  vcl_size_t col_thread_id = threadIdx.y;
+  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+  vcl_size_t block_num = (A_row_size + block_size - 1) / 

<TRUNCATED>

[39/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
new file mode 100644
index 0000000..65b323e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
@@ -0,0 +1,821 @@
+#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/amg_operations.hpp
+    @brief Implementations of routines for AMG using CUDA.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include <stdexcept>   // std::runtime_error (used below)
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+
+#include <map>
+#include <set>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+__global__ void amg_influence_trivial_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          unsigned int size1,
+          unsigned int nnz,
+          unsigned int *influences_row,
+          unsigned int *influences_id,
+          unsigned int *influences_values
+          )
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size1; i += global_size)
+  {
+    unsigned int tmp = row_indices[i];
+    influences_row[i] = tmp;
+    influences_values[i] = row_indices[i+1] - tmp;
+  }
+
+  for (unsigned int i = global_id; i < nnz; i += global_size)
+    influences_id[i] = column_indices[i];
+
+  if (global_id == 0)
+    influences_row[size1] = row_indices[size1];
+}
+
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                           viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+
+  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+                                             static_cast<unsigned int>(A.size1()),
+                                             static_cast<unsigned int>(A.nnz()),
+                                             viennacl::cuda_arg(amg_context.influence_jumper_),
+                                             viennacl::cuda_arg(amg_context.influence_ids_),
+                                             viennacl::cuda_arg(amg_context.influence_values_)
+                                            );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
+}
+
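+// Worked example (illustrative data, not taken from the commit): the "trivial"
+// influence measure simply mirrors the CSR structure of A. For a 3x3 matrix with
+//   row_indices    = {0, 2, 3, 5}
+//   column_indices = {0, 1, 1, 0, 2}
+// the kernel launched above fills
+//   influences_row    = {0, 2, 3, 5}      (copy of row_indices, including the closing nnz entry)
+//   influences_id     = {0, 1, 1, 0, 2}   (copy of column_indices)
+//   influences_values = {2, 1, 2}         (number of nonzeros per row)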
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                            viennacl::linalg::amg_tag & tag)
+{
+  (void)A; (void)amg_context; (void)tag; // silence unused-parameter warnings
+  throw std::runtime_error("not implemented yet");
+}
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+  // TODO: dispatch based on influence tolerance provided
+  amg_influence_trivial(A, amg_context, tag);
+}
+
+/** @brief Assign IDs to coarse points.
+*
+*  TODO: Use exclusive_scan on GPU for this.
+*/
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
+  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
+  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
+  viennacl::backend::memory_read(amg_context.coarse_id_.handle(),   0, coarse_ids.raw_size(),  coarse_ids.get());
+
+  unsigned int coarse_id = 0;
+  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
+  {
+    coarse_ids.set(i, coarse_id);
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+      ++coarse_id;
+  }
+
+  amg_context.num_coarse_ = coarse_id;
+
+  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+}
+
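+// Hypothetical GPU variant of the enumeration above, addressing the TODO via
+// Thrust (sketch only, not part of this commit; it requires <thrust/device_ptr.h>,
+// <thrust/device_vector.h>, <thrust/transform.h>, <thrust/scan.h>, <thrust/reduce.h>,
+// and raw device pointers extracted from the ViennaCL handles). Idea: flag coarse
+// points with 1, exclusive-scan the flags to obtain contiguous coarse IDs, and
+// reduce the flags to count the coarse points.
+struct example_is_coarse
+{
+  unsigned int coarse_marker;
+  __host__ __device__ unsigned int operator()(unsigned int t) const
+  {
+    return (t == coarse_marker) ? 1u : 0u;
+  }
+};
+
+inline unsigned int example_enumerate_coarse_points_gpu(unsigned int const *point_types,
+                                                        unsigned int *coarse_ids,
+                                                        vcl_size_t n,
+                                                        unsigned int coarse_marker)
+{
+  example_is_coarse op;
+  op.coarse_marker = coarse_marker;
+
+  thrust::device_ptr<const unsigned int> types_begin = thrust::device_pointer_cast(point_types);
+  thrust::device_vector<unsigned int> flags(n);
+  thrust::transform(types_begin, types_begin + n, flags.begin(), op);   // 1 for coarse, 0 otherwise
+  thrust::exclusive_scan(flags.begin(), flags.end(),
+                         thrust::device_pointer_cast(coarse_ids));      // running coarse ID per point
+  return thrust::reduce(flags.begin(), flags.end(), 0u);                // total number of coarse points
+}
+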
+//////////////////////////////////////
+
+/** @brief CUDA kernel initializing the work vectors at each PMIS iteration */
+template<typename IndexT>
+__global__ void amg_pmis2_init_workdata(IndexT *work_state,
+                                        IndexT *work_random,
+                                        IndexT *work_index,
+                                        IndexT const *point_types,
+                                        IndexT const *random_weights,
+                                        unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    switch (point_types[i])
+    {
+    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break;
+    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:      work_state[i] = 0; break;
+    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    work_state[i] = 2; break;
+    default:
+      break; // do nothing
+    }
+
+    work_random[i] = random_weights[i];
+    work_index[i]  = i;
+  }
+}
+
+/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation */
+template<typename IndexT>
+__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
+                                           IndexT const *work_random,
+                                           IndexT const *work_index,
+                                           IndexT       *work_state2,
+                                           IndexT       *work_random2,
+                                           IndexT       *work_index2,
+                                           IndexT const *influences_row,
+                                           IndexT const *influences_id,
+                                           unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    // load
+    unsigned int state  = work_state[i];
+    unsigned int random = work_random[i];
+    unsigned int index  = work_index[i];
+
+    // max
+    unsigned int j_stop = influences_row[i + 1];
+    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+    {
+      unsigned int influenced_point_id = influences_id[j];
+
+      // lexicographic triple-max (not particularly pretty, but does the job):
+      if (state < work_state[influenced_point_id])
+      {
+        state  = work_state[influenced_point_id];
+        random = work_random[influenced_point_id];
+        index  = work_index[influenced_point_id];
+      }
+      else if (state == work_state[influenced_point_id])
+      {
+        if (random < work_random[influenced_point_id])
+        {
+          state  = work_state[influenced_point_id];
+          random = work_random[influenced_point_id];
+          index  = work_index[influenced_point_id];
+        }
+        else if (random == work_random[influenced_point_id])
+        {
+          if (index < work_index[influenced_point_id])
+          {
+            state  = work_state[influenced_point_id];
+            random = work_random[influenced_point_id];
+            index  = work_index[influenced_point_id];
+          }
+        } // max(random)
+      } // max(state)
+    } // for
+
+    // store
+    work_state2[i]  = state;
+    work_random2[i] = random;
+    work_index2[i]  = index;
+  }
+}
+
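+// The triple-max above is an ordinary lexicographic maximum over the tuples
+// (state, random, index). On the host, the comparison could be written as
+// (illustrative helper only, requires <tuple>):
+inline bool example_tuple_less(unsigned int s1, unsigned int r1, unsigned int i1,
+                               unsigned int s2, unsigned int r2, unsigned int i2)
+{
+  return std::tie(s1, r1, i1) < std::tie(s2, r2, i2);
+}
+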
+/** @brief CUDA kernel for marking MIS and non-MIS nodes */
+template<typename IndexT>
+__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
+                                         IndexT const *work_index,
+                                         IndexT *point_types,
+                                         IndexT *undecided_buffer,
+                                         unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  unsigned int num_undecided = 0;
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    unsigned int max_state  = work_state[i];
+    unsigned int max_index  = work_index[i];
+
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+    {
+      if (i == max_index) // make this a MIS node
+        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+      else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
+        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+      else
+        num_undecided += 1;
+    }
+  }
+
+  // block-wise reduction of the number of undecided nodes (assumes blockDim.x is a power of two and at most 256):
+  __shared__ unsigned int shared_buffer[256];
+  shared_buffer[threadIdx.x] = num_undecided;
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride];
+  }
+
+  if (threadIdx.x == 0)
+    undecided_buffer[blockIdx.x] = shared_buffer[0];
+
+}
+
+/** @brief CUDA kernel resetting all points not selected as coarse (MIS) points back to undecided, so that subsequent passes can run */
+__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
+  }
+}
+
+/** @brief AG (aggregation based) coarsening: CUDA implementation of stage 1 (MIS-2 selection of coarse points)
+*
+* @param A             Operator matrix for the current level
+* @param amg_context   AMG hierarchy data structures
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                               viennacl::linalg::amg_tag & tag)
+{
+  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
+  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
+  for (std::size_t i=0; i<random_weights.size(); ++i)
+    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+  random_weights.switch_memory_context(viennacl::traits::context(A));
+
+  // work vectors:
+  viennacl::vector<unsigned int> work_state(A.size1(),  viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index(A.size1(),  viennacl::traits::context(A));
+
+  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));
+
+  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
+  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());
+
+  unsigned int pmis_iters = 0;
+  while (num_undecided > 0)
+  {
+    ++pmis_iters;
+
+    //
+    // init temporary work data:
+    //
+    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
+                                          viennacl::cuda_arg(work_random),
+                                          viennacl::cuda_arg(work_index),
+                                          viennacl::cuda_arg(amg_context.point_types_),
+                                          viennacl::cuda_arg(random_weights),
+                                          static_cast<unsigned int>(A.size1())
+                                         );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata");
+
+
+    //
+    // Propagate maximum tuple twice
+    //
+    for (unsigned int r = 0; r < 2; ++r)
+    {
+      // max operation over neighborhood
+      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
+                                               viennacl::cuda_arg(work_random),
+                                               viennacl::cuda_arg(work_index),
+                                               viennacl::cuda_arg(work_state2),
+                                               viennacl::cuda_arg(work_random2),
+                                               viennacl::cuda_arg(work_index2),
+                                               viennacl::cuda_arg(amg_context.influence_jumper_),
+                                               viennacl::cuda_arg(amg_context.influence_ids_),
+                                               static_cast<unsigned int>(A.size1())
+                                              );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");
+
+      // copy work arrays (could be fused into a single kernel if needed; the previous kernel is in most cases sufficiently heavy)
+      work_state  = work_state2;
+      work_random = work_random2;
+      work_index  = work_index2;
+    }
+
+    //
+    // mark MIS and non-MIS nodes:
+    //
+    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
+                                           viennacl::cuda_arg(work_index),
+                                           viennacl::cuda_arg(amg_context.point_types_),
+                                           viennacl::cuda_arg(undecided_buffer),
+                                           static_cast<unsigned int>(A.size1())
+                                          );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes");
+
+    // get number of undecided points on host:
+    viennacl::backend::memory_read(undecided_buffer.handle(),   0, undecided_buffer_host.raw_size(),  undecided_buffer_host.get());
+    num_undecided = 0;
+    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
+      num_undecided += undecided_buffer_host[i];
+
+  } //while
+
+  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
+  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+                                      static_cast<unsigned int>(amg_context.point_types_.size())
+                                     );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
+}
+
+
+
+
+
+template<typename IndexT>
+__global__ void amg_agg_propagate_coarse_indices(IndexT       *point_types,
+                                                 IndexT       *coarse_ids,
+                                                 IndexT const *influences_row,
+                                                 IndexT const *influences_id,
+                                                 unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+    {
+      unsigned int coarse_index = coarse_ids[i];
+
+      unsigned int j_stop = influences_row[i + 1];
+      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+      {
+        unsigned int influenced_point_id = influences_id[j];
+        coarse_ids[influenced_point_id] = coarse_index; // Set aggregate index for fine point
+
+        if (influenced_point_id != i) // Note: Any write races between threads are harmless here
+          point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+      }
+    }
+  }
+}
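+
+// Illustrative example (made-up values, not part of the original source): if node 3 is
+// a coarse point with coarse_ids[3] == 1 and its influence list contains nodes 2 and 4,
+// the kernel above sets coarse_ids[2] = coarse_ids[4] = 1 and marks nodes 2 and 4 as
+// POINT_TYPE_FINE, i.e. a coarse point's aggregate absorbs its direct neighbors.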
+
+
+template<typename IndexT>
+__global__ void amg_agg_merge_undecided(IndexT       *point_types,
+                                        IndexT       *coarse_ids,
+                                        IndexT const *influences_row,
+                                        IndexT const *influences_id,
+                                        unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+    {
+      unsigned int j_stop = influences_row[i + 1];
+      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+      {
+        unsigned int influenced_point_id = influences_id[j];
+        if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+        {
+          //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl;
+          coarse_ids[i] = coarse_ids[influenced_point_id];
+          break;
+        }
+      }
+    }
+  }
+}
+
+
+__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
+                                          unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+  }
+}
+
+
+/** @brief AG (aggregation based) coarsening, CUDA implementation (VIENNACL_AMG_COARSE_AG)
+*
+* @param A             Operator matrix
+* @param amg_context   AMG hierarchy datastructures
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+
+  amg_influence_trivial(A, amg_context, tag);
+
+  //
+  // Stage 1: Build aggregates:
+  //
+  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
+    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+  else
+    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");
+
+  viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context);
+
+  //
+  // Stage 2: Propagate coarse aggregate indices to neighbors:
+  //
+  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+                                                 viennacl::cuda_arg(amg_context.coarse_id_),
+                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
+                                                 viennacl::cuda_arg(amg_context.influence_ids_),
+                                                 static_cast<unsigned int>(A.size1())
+                                                );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");
+
+
+  //
+  // Stage 3: Merge remaining undecided points (merging to the first aggregate found when cycling over the neighborhood)
+  //
+  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+                                        viennacl::cuda_arg(amg_context.coarse_id_),
+                                        viennacl::cuda_arg(amg_context.influence_jumper_),
+                                        viennacl::cuda_arg(amg_context.influence_ids_),
+                                        static_cast<unsigned int>(A.size1())
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");
+
+  //
+  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+  //          Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+  //
+  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+                                         static_cast<unsigned int>(A.size1())
+                                        );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
+}
+
+
+
+
+/** @brief Calls the right coarsening procedure
+*
+* @param A            Operator matrix on all levels
+* @param amg_context  AMG hierarchy datastructures
+* @param tag          AMG preconditioner tag
+*/
+template<typename InternalT1>
+void amg_coarse(InternalT1 & A,
+                viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_coarsening_method())
+  {
+  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+  default: throw std::runtime_error("not implemented yet");
+  }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+template<typename NumericT>
+__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
+                                       unsigned int *P_col_buffer,
+                                       NumericT *P_elements,
+                                       unsigned int *coarse_ids,
+                                       unsigned int size)
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int i = global_id; i < size; i += global_size)
+  {
+    P_row_buffer[i] = i;
+    P_col_buffer[i] = coarse_ids[i];
+    P_elements[i]   = NumericT(1);
+  }
+
+  // set last entry as well:
+  if (global_id == 0)
+    P_row_buffer[size] = size;
+}
+
+/** @brief AG (aggregation based) interpolation, CUDA implementation
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
+                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
+                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
+                                       viennacl::cuda_arg(amg_context.coarse_id_),
+                                       static_cast<unsigned int>(A.size1())
+                                      );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");
+
+  P.generate_row_block_information();
+}
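+
+// Illustrative example (assumed values, not part of the original source): the kernel
+// above writes exactly one entry per row, P(i, coarse_ids[i]) = 1. For
+// coarse_ids = {0, 0, 1, 2, 1} and amg_context.num_coarse_ == 3, the tentative
+// prolongation is the 5x3 matrix
+//   [1 0 0]
+//   [1 0 0]
+//   [0 1 0]
+//   [0 0 1]
+//   [0 1 0]
+// so a coarse-grid value is simply injected into every fine point of the same aggregate.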
+
+
+
+template<typename NumericT>
+__global__ void amg_interpol_sa_kernel(
+          const unsigned int *A_row_indices,
+          const unsigned int *A_col_indices,
+          const NumericT     *A_elements,
+          unsigned int A_size1,
+          unsigned int A_nnz,
+          unsigned int *Jacobi_row_indices,
+          unsigned int *Jacobi_col_indices,
+          NumericT     *Jacobi_elements,
+          NumericT     omega
+          )
+{
+  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int global_size = gridDim.x * blockDim.x;
+
+  for (unsigned int row = global_id; row < A_size1; row += global_size)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    Jacobi_row_indices[row] = row_begin;
+
+    // Step 1: Extract diagonal:
+    NumericT diag = 0;
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      if (A_col_indices[j] == row)
+      {
+        diag = A_elements[j];
+        break;
+      }
+    }
+
+    // Step 2: Write entries:
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col_index = A_col_indices[j];
+      Jacobi_col_indices[j] = col_index;
+
+      if (col_index == row)
+        Jacobi_elements[j] = NumericT(1) - omega;
+      else
+        Jacobi_elements[j] = - omega * A_elements[j] / diag;
+    }
+  }
+
+  if (global_id == 0)
+    Jacobi_row_indices[A_size1] = A_nnz; // don't forget finalizer
+}
+
+
+
+/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  // form tentative operator:
+  amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+                                       static_cast<unsigned int>(A.size1()),
+                                       static_cast<unsigned int>(A.nnz()),
+                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
+                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
+                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
+                                       NumericT(tag.get_jacobi_weight())
+                                      );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");
+
+  P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+  P.generate_row_block_information();
+}
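+
+// Sketch of the construction above (standard smoothed aggregation, matching the kernel):
+// amg_interpol_sa_kernel assembles the damped Jacobi operator
+//   S = I - omega * D^{-1} * A,   D = diag(A),   omega = tag.get_jacobi_weight(),
+// row by row: the diagonal entry becomes 1 - omega, off-diagonal entries become
+// -omega * a_ij / a_ii. The final prolongation is then P = S * P_tentative, computed by
+// the viennacl::linalg::prod() call above.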
+
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+                  MatrixT & P,
+                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                  viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_interpolation_method())
+  {
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag     (A, P, amg_context, tag); break;
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa     (A, P, amg_context, tag); break;
+  default: throw std::runtime_error("Not implemented yet!");
+  }
+}
+
+
+template<typename NumericT>
+__global__ void compressed_matrix_assign_to_dense(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          NumericT *B,
+          unsigned int B_row_start,     unsigned int B_col_start,
+          unsigned int B_row_inc,       unsigned int B_col_inc,
+          unsigned int B_row_size,      unsigned int B_col_size,
+          unsigned int B_internal_rows, unsigned int B_internal_cols)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < B_row_size;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int j = row_indices[row]; j<row_end; j++)
+      B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
+  }
+}
+
+
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                     viennacl::matrix_base<NumericT> & B)
+{
+  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+                                                  viennacl::cuda_arg<NumericT>(B),
+                                                  static_cast<unsigned int>(viennacl::traits::start1(B)),           static_cast<unsigned int>(viennacl::traits::start2(B)),
+                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)),          static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                                                  static_cast<unsigned int>(viennacl::traits::size1(B)),            static_cast<unsigned int>(viennacl::traits::size2(B)),
+                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)),   static_cast<unsigned int>(viennacl::traits::internal_size2(B))
+                                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
+}
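+
+// Usage sketch (illustrative only; A and B are hypothetical objects set up elsewhere in
+// a CUDA-enabled context):
+//   viennacl::compressed_matrix<double> A; /* ... filled ... */
+//   viennacl::matrix<double> B(A.size1(), A.size2());
+//   viennacl::linalg::cuda::amg::assign_to_dense(A, B);
+// Only the stored nonzeros of A are written into B; entries of B without a stored
+// counterpart in A are left untouched, so B should be cleared first if a full dense
+// copy is required.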
+
+
+
+
+template<typename NumericT>
+__global__ void compressed_matrix_smooth_jacobi_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          NumericT weight,
+          const NumericT * x_old,
+          NumericT * x_new,
+          const NumericT * rhs,
+          unsigned int size)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < size;
+                    row += gridDim.x * blockDim.x)
+  {
+    NumericT sum = NumericT(0);
+    NumericT diag = NumericT(1);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int j = row_indices[row]; j < row_end; ++j)
+    {
+      unsigned int col = column_indices[j];
+      if (col == row)
+        diag = elements[j];
+      else
+        sum += elements[j] * x_old[col];
+    }
+    x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row];
+  }
+}
+
+
+
+
+/** @brief Damped Jacobi Smoother (CUDA version)
+*
+* @param iterations  Number of smoother iterations
+* @param A           Operator matrix for the smoothing
+* @param x           The vector smoothing is applied to
+* @param x_backup    Temporary vector, distinct from x, used to hold a copy of x during each sweep
+* @param rhs_smooth  The right hand side of the equation for the smoother
+* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+                   compressed_matrix<NumericT> const & A,
+                   vector<NumericT> & x,
+                   vector<NumericT> & x_backup,
+                   vector<NumericT> const & rhs_smooth,
+                   NumericT weight)
+{
+  for (unsigned int i=0; i<iterations; ++i)
+  {
+    x_backup = x;
+
+    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+                                                         static_cast<NumericT>(weight),
+                                                         viennacl::cuda_arg(x_backup),
+                                                         viennacl::cuda_arg(x),
+                                                         viennacl::cuda_arg(rhs_smooth),
+                                                         static_cast<unsigned int>(rhs_smooth.size())
+                                                        );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
+  }
+}
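+
+// Per-row update performed by the kernel above (damped Jacobi):
+//   x_new[i] = (1 - weight) * x_old[i] + weight * (rhs[i] - sum_{j != i} A(i,j) * x_old[j]) / A(i,i)
+// Usage sketch (illustrative only; A, x, rhs are hypothetical objects living in a CUDA
+// context, x_scratch is a separate vector of the same size as x):
+//   viennacl::vector<double> x_scratch(x.size(), viennacl::traits::context(x));
+//   viennacl::linalg::cuda::amg::smooth_jacobi(2u, A, x, x_scratch, rhs, 0.67);
+// Note that x_backup must be distinct from x; it receives a copy of x at the start of
+// every sweep.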
+
+
+} //namespace amg
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..39f0015
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
@@ -0,0 +1,166 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp
+    @brief CUDA kernel calls for the bisection algorithm
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+                       const unsigned int mat_size,
+                       const NumericT lg, const NumericT ug,
+                       const NumericT precision)
+{
+
+
+  dim3  blocks(1, 1, 1);
+  dim3  threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
+
+  bisectKernelSmall<<< blocks, threads >>>(
+    viennacl::cuda_arg(input.g_a),
+    viennacl::cuda_arg(input.g_b) + 1,
+    mat_size,
+    viennacl::cuda_arg(result.vcl_g_left),
+    viennacl::cuda_arg(result.vcl_g_right),
+    viennacl::cuda_arg(result.vcl_g_left_count),
+    viennacl::cuda_arg(result.vcl_g_right_count),
+    lg, ug, 0, mat_size,
+    precision
+    );
+  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed");
+}
+
+
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT lg, const NumericT ug,
+                   const NumericT precision)
+ {
+
+  dim3  blocks(1, 1, 1);
+  dim3  threads(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
+  bisectKernelLarge<<< blocks, threads >>>
+    (viennacl::cuda_arg(input.g_a),
+     viennacl::cuda_arg(input.g_b) + 1,
+     mat_size,
+     lg, ug, static_cast<unsigned int>(0), mat_size, precision,
+     viennacl::cuda_arg(result.g_num_one),
+     viennacl::cuda_arg(result.g_num_blocks_mult),
+     viennacl::cuda_arg(result.g_left_one),
+     viennacl::cuda_arg(result.g_right_one),
+     viennacl::cuda_arg(result.g_pos_one),
+     viennacl::cuda_arg(result.g_left_mult),
+     viennacl::cuda_arg(result.g_right_mult),
+     viennacl::cuda_arg(result.g_left_count_mult),
+     viennacl::cuda_arg(result.g_right_count_mult),
+     viennacl::cuda_arg(result.g_blocks_mult),
+     viennacl::cuda_arg(result.g_blocks_mult_sum)
+     );
+  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed.");
+}
+
+
+// compute eigenvalues for intervals that contained only one eigenvalue
+// after the first processing step
+template<typename NumericT>
+void bisectLarge_OneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT precision)
+ {
+
+  unsigned int num_one_intervals = result.g_num_one;
+  unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+                                                                         mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+  dim3 grid_onei;
+  grid_onei.x = num_blocks;
+  grid_onei.y = 1;
+  grid_onei.z = 1;
+  dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+
+  bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
+    (viennacl::cuda_arg(input.g_a),
+     viennacl::cuda_arg(input.g_b) + 1,
+     mat_size, num_one_intervals,
+     viennacl::cuda_arg(result.g_left_one),
+     viennacl::cuda_arg(result.g_right_one),
+     viennacl::cuda_arg(result.g_pos_one),
+     precision
+     );
+  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals() FAILED.");
+}
+
+
+// process intervals that contained more than one eigenvalue after
+// the first processing step
+template<typename NumericT>
+void bisectLarge_MultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT precision)
+ {
+    // get the number of blocks of intervals; each block contains at most
+    // MAX_THREADS_BLOCK eigenvalues, so that it can be processed by one thread
+    // block with one thread per interval once every interval contains a single eigenvalue
+    unsigned int  num_blocks_mult = result.g_num_blocks_mult;
+
+    // setup the execution environment
+    dim3  grid_mult(num_blocks_mult, 1, 1);
+    dim3  threads_mult(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+    bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
+      (viennacl::cuda_arg(input.g_a),
+       viennacl::cuda_arg(input.g_b) + 1,
+       mat_size,
+       viennacl::cuda_arg(result.g_blocks_mult),
+       viennacl::cuda_arg(result.g_blocks_mult_sum),
+       viennacl::cuda_arg(result.g_left_mult),
+       viennacl::cuda_arg(result.g_right_mult),
+       viennacl::cuda_arg(result.g_left_count_mult),
+       viennacl::cuda_arg(result.g_right_count_mult),
+       viennacl::cuda_arg(result.g_lambda_mult),
+       viennacl::cuda_arg(result.g_pos_mult),
+       precision
+      );
+    viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals() FAILED.");
+}
+}
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
new file mode 100755
index 0000000..77c9773
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
@@ -0,0 +1,928 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp
+    @brief First step of the bisection algorithm for the computation of eigenvalues.
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
+  step of the computation. */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Write data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void writeToGmem(const unsigned int tid, const unsigned int tid_2,
+                 const unsigned int num_threads_active,
+                 const unsigned int num_blocks_mult,
+                 NumericT *g_left_one, NumericT *g_right_one,
+                 unsigned int *g_pos_one,
+                 NumericT *g_left_mult, NumericT *g_right_mult,
+                 unsigned int *g_left_count_mult,
+                 unsigned int *g_right_count_mult,
+                 NumericT *s_left, NumericT *s_right,
+                 unsigned short *s_left_count, unsigned short *s_right_count,
+                 unsigned int *g_blocks_mult,
+                 unsigned int *g_blocks_mult_sum,
+                 unsigned short *s_compaction_list,
+                 unsigned short *s_cl_helper,
+                 unsigned int offset_mult_lambda
+                )
+{
+
+    if (tid < offset_mult_lambda)
+    {
+
+        g_left_one[tid] = s_left[tid];
+        g_right_one[tid] = s_right[tid];
+        // right count can be used to order eigenvalues without sorting
+        g_pos_one[tid] = s_right_count[tid];
+    }
+    else
+    {
+
+
+        g_left_mult[tid - offset_mult_lambda] = s_left[tid];
+        g_right_mult[tid - offset_mult_lambda] = s_right[tid];
+        g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
+        g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+
+        if (tid_2 < offset_mult_lambda)
+        {
+
+            g_left_one[tid_2] = s_left[tid_2];
+            g_right_one[tid_2] = s_right[tid_2];
+            // right count can be used to order eigenvalues without sorting
+            g_pos_one[tid_2] = s_right_count[tid_2];
+        }
+        else
+        {
+
+            g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
+            g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
+            g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
+            g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
+        }
+
+    } // end writing out data
+
+    __syncthreads();
+
+    // note that s_cl_blocking = s_compaction_list + 1; that is, by writing out
+    // s_compaction_list we write the exclusive scan result
+    if (tid <= num_blocks_mult)
+    {
+        g_blocks_mult[tid] = s_compaction_list[tid];
+        g_blocks_mult_sum[tid] = s_cl_helper[tid];
+    }
+
+    if (tid_2 <= num_blocks_mult)
+    {
+        g_blocks_mult[tid_2] = s_compaction_list[tid_2];
+        g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
+    }
+}
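+
+// Illustrative note: since s_cl_blocking == s_compaction_list + 1, writing out
+// s_compaction_list shifts the inclusive scan result by one slot, e.g. an inclusive
+// result [2, 2, 5, 6, ...] stored in s_cl_blocking is emitted as the exclusive scan
+// [0, 2, 2, 5, 6, ...] (s_compaction_list[0] having been left at 0), which matches the
+// "exclusive scan result" mentioned in the comment above.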
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform final stream compaction before writing data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
+                    const unsigned int num_threads_active,
+                    unsigned int &offset_mult_lambda,
+                    NumericT *s_left, NumericT *s_right,
+                    unsigned short *s_left_count, unsigned short *s_right_count,
+                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
+                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
+                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
+                    NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
+                    unsigned int &left_count, unsigned int &right_count,
+                    unsigned int &left_count_2, unsigned int &right_count_2,
+                    unsigned int c_block_iend, unsigned int c_sum_block,
+                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
+                   )
+{
+    // cache data before performing compaction
+    left = s_left[tid];
+    right = s_right[tid];
+
+    if (tid_2 < num_threads_active)
+    {
+
+        left_2 = s_left[tid_2];
+        right_2 = s_right[tid_2];
+    }
+
+    __syncthreads();
+
+    // determine addresses for intervals containing multiple eigenvalues and
+    // addresses for blocks of intervals
+    unsigned int ptr_w = 0;
+    unsigned int ptr_w_2 = 0;
+    unsigned int ptr_blocking_w = 0;
+    unsigned int ptr_blocking_w_2 = 0;
+
+
+
+    ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
+            : s_cl_mult[tid] + offset_mult_lambda;
+
+    if (0 != c_block_iend)
+    {
+        ptr_blocking_w = s_cl_blocking[tid];
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+        ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
+                  : s_cl_mult[tid_2] + offset_mult_lambda;
+
+        if (0 != c_block_iend_2)
+        {
+            ptr_blocking_w_2 = s_cl_blocking[tid_2];
+        }
+    }
+
+
+    __syncthreads();
+    if (tid < num_threads_active)
+    {
+        // store compactly in shared mem
+        s_left[ptr_w] = left;
+        s_right[ptr_w] = right;
+        s_left_count[ptr_w] = left_count;
+        s_right_count[ptr_w] = right_count;
+    }
+
+    __syncthreads();
+
+    if (tid == 1)
+    {
+        s_left[ptr_w] = left;
+        s_right[ptr_w] = right;
+        s_left_count[ptr_w] = left_count;
+        s_right_count[ptr_w] = right_count;
+    }
+
+    if (0 != c_block_iend)
+    {
+        s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
+        s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+
+        // store compactly in shared mem
+        s_left[ptr_w_2] = left_2;
+        s_right[ptr_w_2] = right_2;
+        s_left_count[ptr_w_2] = left_count_2;
+        s_right_count[ptr_w_2] = right_count_2;
+
+        if (0 != c_block_iend_2)
+        {
+            s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
+            s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute addresses to obtain compact list of block start addresses
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
+                              const unsigned int num_threads_compaction,
+                              unsigned short *s_cl_blocking,
+                              unsigned short *s_cl_helper
+                             )
+{
+    // prepare for second step of block generation: compaction of the block
+    // list itself so that it can be written out efficiently
+    s_cl_blocking[tid] = s_cl_helper[tid];
+
+    if (tid_2 < num_threads_compaction)
+    {
+        s_cl_blocking[tid_2] = s_cl_helper[tid_2];
+    }
+
+    __syncthreads();
+
+    // additional scan to compact s_cl_blocking, which permits generating a
+    // compact list of eigenvalue blocks, each one containing about
+    // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
+    // processed by one thread block in a subsequent processing step)
+
+    unsigned int offset = 1;
+
+    // build scan tree
+    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int  ai = offset*(2*tid+1)-1;
+            unsigned int  bi = offset*(2*tid+2)-1;
+            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+        }
+
+        offset <<= 1;
+    }
+
+    // traverse down tree: first down to level 2 across
+    for (int d = 2; d < num_threads_compaction; d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        //
+        if (tid < (d-1))
+        {
+
+            unsigned int  ai = offset*(tid+1) - 1;
+            unsigned int  bi = ai + (offset >> 1);
+            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform scan to obtain number of eigenvalues before a specific block
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
+              const unsigned int num_threads_active,
+              const unsigned int num_threads_compaction,
+              unsigned short *s_cl_blocking,
+              unsigned short *s_cl_helper)
+{
+    unsigned int offset = 1;
+
+    // first step of scan to build the sum of elements within each block
+    // build up tree
+    for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int ai = offset*(2*tid+1)-1;
+            unsigned int bi = offset*(2*tid+2)-1;
+
+            s_cl_blocking[bi] += s_cl_blocking[ai];
+        }
+
+        offset *= 2;
+    }
+
+    // second step of scan to obtain the prefix sums over the blocks
+    // traverse down tree
+    for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        if (tid < (d-1))
+        {
+
+            unsigned int ai = offset*(tid+1) - 1;
+            unsigned int bi = ai + (offset >> 1);
+
+            s_cl_blocking[bi] += s_cl_blocking[ai];
+        }
+    }
+
+    __syncthreads();
+
+    if (0 == tid)
+    {
+
+        // move last element of scan to last element that is valid
+        // necessary because the number of threads employed for scan is a power
+        // of two and not necessarily the number of active threads
+        s_cl_helper[num_threads_active - 1] =
+            s_cl_helper[num_threads_compaction - 1];
+        s_cl_blocking[num_threads_active - 1] =
+            s_cl_blocking[num_threads_compaction - 1];
+    }
+}
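+
+// Worked example (made-up values, purely to illustrate the arithmetic of the scan above):
+// for num_threads_compaction = 8 and s_cl_blocking = [2, 0, 3, 1, 0, 4, 0, 2], the
+// up-sweep produces
+//   [2, 2, 3, 6, 0, 4, 0, 12]
+// and the partial down-sweep completes the inclusive prefix sum
+//   [2, 2, 5, 6, 6, 10, 10, 12].
+// The final step for tid == 0 copies the totals from index num_threads_compaction - 1
+// down to index num_threads_active - 1 whenever fewer threads are active than the
+// power-of-two scan size.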
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform initial scan for compaction of intervals containing one and
+//! multiple eigenvalues; also do initial scan to build blocks
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
+            const unsigned int num_threads_active,
+            const unsigned int num_threads_compaction,
+            unsigned short *s_cl_one, unsigned short *s_cl_mult,
+            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
+           )
+{
+
+    // perform scan to compactly write out the intervals containing one and
+    // multiple eigenvalues
+    // also generate tree for blocking of intervals containing multiple
+    // eigenvalues
+
+    unsigned int offset = 1;
+
+    // build scan tree
+    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int  ai = offset*(2*tid+1);
+            unsigned int  bi = offset*(2*tid+2)-1;
+
+            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
+            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
+
+            // s_cl_helper is binary and zero for an internal node and 1 for a
+            // root node of a tree corresponding to a block
+            // s_cl_blocking contains the number of nodes in each sub-tree at each
+            // iteration, the data has to be kept to compute the total number of
+            // eigenvalues per block that, in turn, is needed to efficiently
+            // write out data in the second step
+            if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
+            {
+
+                // check how many children are non-terminated
+                if (s_cl_helper[ai - 1] == 1)
+                {
+                    // mark as terminated
+                    s_cl_helper[bi] = 1;
+                }
+                else if (s_cl_helper[bi] == 1)
+                {
+                    // mark as terminated
+                    s_cl_helper[ai - 1] = 1;
+                }
+                else    // both children are non-terminated
+                {
+
+                    unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
+
+                    if (temp > (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
+                    {
+
+                        // the two child trees have to form separate blocks, terminate trees
+                        s_cl_helper[ai - 1] = 1;
+                        s_cl_helper[bi] = 1;
+                    }
+                    else
+                    {
+                        // build up tree by joining subtrees
+                        s_cl_blocking[bi] = temp;
+                        s_cl_blocking[ai - 1] = 0;
+                    }
+                }
+            }  // end s_cl_helper update
+
+        }
+
+        offset <<= 1;
+    }
+
+
+    // traverse down tree, this only for stream compaction, not for block
+    // construction
+    for (int d = 2; d < num_threads_compaction; d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        //
+        if (tid < (d-1))
+        {
+
+            unsigned int  ai = offset*(tid+1) - 1;
+            unsigned int  bi = ai + (offset >> 1);
+
+            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
+            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
+        }
+    }
+
+}
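+
+// Illustrative note (hypothetical numbers): the block-building logic above greedily
+// merges sibling subtrees as long as their combined eigenvalue count stays within the
+// per-block budget (VIENNACL_BISECT_MAX_THREADS_BLOCK, or half of it for mat_size <= 512).
+// For a budget of 256, subtrees holding 120 and 100 eigenvalues are joined into one block
+// of 220; had the budget been 128, both subtrees would be terminated and written out as
+// separate eigenvalue blocks for the second processing step.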
+
+////////////////////////////////////////////////////////////////////////////////
+//! Store all non-empty intervals resulting from the subdivision of the interval
+//! currently processed by the thread
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+storeNonEmptyIntervalsLarge(unsigned int addr,
+                            const unsigned int num_threads_active,
+                            NumericT  *s_left, NumericT *s_right,
+                            unsigned short  *s_left_count,
+                            unsigned short *s_right_count,
+                            NumericT left, NumericT mid, NumericT right,
+                            const unsigned short left_count,
+                            const unsigned short mid_count,
+                            const unsigned short right_count,
+                            NumericT epsilon,
+                            unsigned int &compact_second_chunk,
+                            unsigned short *s_compaction_list,
+                            unsigned int &is_active_second)
+{
+    // check if both child intervals are valid
+    if ((left_count != mid_count) && (mid_count != right_count))
+    {
+
+        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                      left, mid, left_count, mid_count, epsilon);
+
+        is_active_second = 1;
+        s_compaction_list[threadIdx.x] = 1;
+        compact_second_chunk = 1;
+    }
+    else
+    {
+
+        // only one non-empty child interval
+
+        // mark that no second child
+        is_active_second = 0;
+        s_compaction_list[threadIdx.x] = 0;
+
+        // store the one valid child interval
+        if (left_count != mid_count)
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          left, mid, left_count, mid_count, epsilon);
+        }
+        else
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          mid, right, mid_count, right_count, epsilon);
+        }
+    }
+}
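+
+// Illustrative note (not part of the original source): for an interval [left, right) the
+// number of contained eigenvalues is right_count - left_count, where each *_count is the
+// number of eigenvalues smaller than the respective bound. After bisecting at mid,
+//   left child  [left, mid)  holds mid_count   - left_count eigenvalues,
+//   right child [mid, right) holds right_count - mid_count  eigenvalues,
+// so the checks left_count != mid_count and mid_count != right_count above are exactly
+// the "child interval is non-empty" tests.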
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+*  g_d  diagonal elements in global memory
+*  g_s  superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+*  n   size of matrix
+*  lg  lower bound of input interval (e.g. Gerschgorin interval)
+*  ug  upper bound of input interval (e.g. Gerschgorin interval)
+*  lg_eig_count  number of eigenvalues that are smaller than lg
+*  ug_eig_count  number of eigenvalues that are smaller than ug
+*  epsilon  desired accuracy of eigenvalues to compute
+*/
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+                  const NumericT lg, const NumericT ug,
+                  const unsigned int lg_eig_count,
+                  const unsigned int ug_eig_count,
+                  NumericT epsilon,
+                  unsigned int *g_num_one,
+                  unsigned int *g_num_blocks_mult,
+                  NumericT *g_left_one, NumericT *g_right_one,
+                  unsigned int *g_pos_one,
+                  NumericT *g_left_mult, NumericT *g_right_mult,
+                  unsigned int *g_left_count_mult,
+                  unsigned int *g_right_count_mult,
+                  unsigned int *g_blocks_mult,
+                  unsigned int *g_blocks_mult_sum
+                 )
+{
+    const unsigned int tid = threadIdx.x;
+
+    // intervals (store left and right because the subdivision tree is in general
+    // not dense)
+    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // number of eigenvalues that are smaller than s_left / s_right
+    // (correspondence is realized via indices)
+    __shared__  unsigned short  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    __shared__  unsigned short  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // helper for stream compaction
+    __shared__  unsigned short  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // state variables for whole block
+    // if 0 then compaction of second chunk of child intervals is not necessary
+    // (because all intervals had exactly one non-dead child)
+    __shared__  unsigned int compact_second_chunk;
+    // if 1 then all threads are converged
+    __shared__  unsigned int all_threads_converged;
+
+    // number of currently active threads
+    __shared__  unsigned int num_threads_active;
+
+    // number of threads to use for stream compaction
+    __shared__  unsigned int num_threads_compaction;
+
+    // helper for exclusive scan
+    unsigned short *s_compaction_list_exc = s_compaction_list + 1;
+
+
+    // variables for currently processed interval
+    // left and right limit of active interval
+    NumericT left = 0.0f;
+    NumericT right = 0.0f;
+    unsigned int left_count = 0;
+    unsigned int right_count = 0;
+    // midpoint of active interval
+    NumericT  mid = 0.0f;
+    // number of eigenvalues smaller than mid
+    unsigned int mid_count = 0;
+    // helper for stream compaction (tracking of threads generating second child)
+    unsigned int is_active_second = 0;
+
+    // initialize lists
+    s_compaction_list[tid] = 0;
+    s_left[tid] = 0;
+    s_right[tid] = 0;
+    s_left_count[tid] = 0;
+    s_right_count[tid] = 0;
+
+    __syncthreads();
+
+    // set up initial configuration
+    if (0 == tid)
+    {
+
+        s_left[0] = lg;
+        s_right[0] = ug;
+        s_left_count[0] = lg_eig_count;
+        s_right_count[0] = ug_eig_count;
+
+        compact_second_chunk = 0;
+        num_threads_active = 1;
+
+        num_threads_compaction = 1;
+
+        all_threads_converged = 1;
+    }
+
+    __syncthreads();
+
+    // for all active threads read intervals from the last level;
+    // the (worst case) number of active threads per level l is 2^l.
+    // Determine coarse intervals; on these intervals the kernel for one or for multiple
+    // eigenvalues will be executed in the second step.
+    while(true)
+    {
+        s_compaction_list[tid] = 0;
+        s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+        subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
+                                num_threads_active,
+                                left, right, left_count, right_count,
+                                mid, all_threads_converged);
+
+        __syncthreads();
+
+        // check if done
+        if (1 == all_threads_converged)
+        {
+            break;
+        }
+
+        // compute number of eigenvalues smaller than mid
+        // use all threads for reading the necessary matrix data from global
+        // memory
+        // use s_left and s_right as scratch space for diagonal and
+        // superdiagonal of matrix
+        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+                                                    mid, threadIdx.x,
+                                                    num_threads_active,
+                                                    s_left, s_right,
+                                                    (left == right));
+
+        __syncthreads();
+
+        // store intervals
+        // for all threads store the first child interval in a contiguous chunk of
+        // memory, and the second child interval -- if it exists -- in a second
+        // chunk; it is likely that all threads reach convergence up to
+        // \a epsilon at the same level; furthermore, at higher levels most / all
+        // threads will have only one child. Storing the first child compactly
+        // (first) avoids a compaction step on the first chunk and (second) makes
+        // a compaction of the second chunk unnecessary at higher levels (when all
+        // threads / intervals have exactly one child)
+        if (tid < num_threads_active)
+        {
+
+            if (left != right)
+            {
+
+                // store intervals
+                storeNonEmptyIntervalsLarge(tid, num_threads_active,
+                                            s_left, s_right,
+                                            s_left_count, s_right_count,
+                                            left, mid, right,
+                                            left_count, mid_count, right_count,
+                                            epsilon, compact_second_chunk,
+                                            s_compaction_list_exc,
+                                            is_active_second);
+            }
+            else
+            {
+
+                // re-write converged interval (has to be stored again because s_left
+                // and s_right are used as scratch space for
+                // computeNumSmallerEigenvalsLarge())
+                s_left[tid] = left;
+                s_right[tid] = left;
+                s_left_count[tid] = left_count;
+                s_right_count[tid] = right_count;
+
+                is_active_second = 0;
+            }
+        }
+
+        // necessary so that compact_second_chunk is up-to-date
+        __syncthreads();
+
+        // perform compaction of chunk where second children are stored
+        // scan of (num_threads_active / 2) elements, thus at most
+        // (num_threads_active / 4) threads are needed
+        if (compact_second_chunk > 0)
+        {
+
+            // create indices for compaction
+            createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+        }
+        __syncthreads();
+
+        if (compact_second_chunk > 0)
+        {
+            compactIntervals(s_left, s_right, s_left_count, s_right_count,
+                             mid, right, mid_count, right_count,
+                             s_compaction_list, num_threads_active,
+                             is_active_second);
+        }
+
+        __syncthreads();
+
+        // update state variables
+        if (0 == tid)
+        {
+
+            // update number of active threads with result of reduction
+            num_threads_active += s_compaction_list[num_threads_active];
+            num_threads_compaction = ceilPow2(num_threads_active);
+
+            compact_second_chunk = 0;
+            all_threads_converged = 1;
+        }
+
+        __syncthreads();
+
+        if (num_threads_compaction > blockDim.x)
+        {
+            break;
+        }
+
+    }
+
+    __syncthreads();
+
+    // generate two lists of intervals; one with intervals that contain one
+    // eigenvalue (or are converged), and one with intervals that need further
+    // subdivision
+
+    // perform two scans in parallel
+
+    unsigned int left_count_2;
+    unsigned int right_count_2;
+
+    unsigned int tid_2 = tid + blockDim.x;
+
+    // cache in per thread registers so that s_left_count and s_right_count
+    // can be used for scans
+    left_count = s_left_count[tid];
+    right_count = s_right_count[tid];
+
+    // some threads have to cache data for two intervals
+    if (tid_2 < num_threads_active)
+    {
+        left_count_2 = s_left_count[tid_2];
+        right_count_2 = s_right_count[tid_2];
+    }
+
+    // compaction lists for intervals containing one eigenvalue and for intervals
+    // containing multiple eigenvalues; offset by one element so that the exclusive
+    // scan leaves the first element untouched
+    unsigned short  *s_cl_one = s_left_count + 1;
+    unsigned short  *s_cl_mult = s_right_count + 1;
+
+    // compaction list for generating blocks of intervals containing multiple
+    // eigenvalues
+    unsigned short  *s_cl_blocking = s_compaction_list_exc;
+    // helper compaction list for generating blocks of intervals
+    __shared__ unsigned short  s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    if (0 == tid)
+    {
+        // set to 0 for exclusive scan
+        s_left_count[0] = 0;
+        s_right_count[0] = 0;
+
+    }
+
+    __syncthreads();
+
+    // flag if interval contains one or multiple eigenvalues
+    unsigned int is_one_lambda = 0;
+    unsigned int is_one_lambda_2 = 0;
+
+    // number of eigenvalues in the interval
+    unsigned int multiplicity = right_count - left_count;
+    is_one_lambda = (1 == multiplicity);
+
+    s_cl_one[tid] = is_one_lambda;
+    s_cl_mult[tid] = (! is_one_lambda);
+
+    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+    s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
+    s_cl_helper[tid] = 0;
+
+    if (tid_2 < num_threads_active)
+    {
+
+        unsigned int multiplicity = right_count_2 - left_count_2;
+        is_one_lambda_2 = (1 == multiplicity);
+
+        s_cl_one[tid_2] = is_one_lambda_2;
+        s_cl_mult[tid_2] = (! is_one_lambda_2);
+
+        // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+        s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
+        s_cl_helper[tid_2] = 0;
+    }
+    else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
+    {
+
+        // clear
+        s_cl_blocking[tid_2] = 0;
+        s_cl_helper[tid_2] = 0;
+    }
+
+
+    scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
+                s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
+
+    __syncthreads();
+
+    scanSumBlocks(tid, tid_2, num_threads_active,
+                  num_threads_compaction, s_cl_blocking, s_cl_helper);
+
+    // end down sweep of scan
+    __syncthreads();
+
+    unsigned int  c_block_iend = 0;
+    unsigned int  c_block_iend_2 = 0;
+    unsigned int  c_sum_block = 0;
+    unsigned int  c_sum_block_2 = 0;
+
+    // for each thread / interval that corresponds to the root node of an interval
+    // block, store the start address of the block and the total number of
+    // eigenvalues in all blocks before this one (which particular thread does so
+    // is irrelevant; the constraint is that exactly one thread falls into each
+    // interval block)
+    if (1 == s_cl_helper[tid])
+    {
+
+        c_block_iend = s_cl_mult[tid] + 1;
+        c_sum_block = s_cl_blocking[tid];
+    }
+
+    if (1 == s_cl_helper[tid_2])
+    {
+
+        c_block_iend_2 = s_cl_mult[tid_2] + 1;
+        c_sum_block_2 = s_cl_blocking[tid_2];
+    }
+
+    scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
+                                  s_cl_blocking, s_cl_helper);
+
+
+    // finished second scan for s_cl_blocking
+    __syncthreads();
+
+    // determine the global results
+    __shared__  unsigned int num_blocks_mult;
+    __shared__  unsigned int num_mult;
+    __shared__  unsigned int offset_mult_lambda;
+
+    if (0 == tid)
+    {
+
+        num_blocks_mult = s_cl_blocking[num_threads_active - 1];
+        offset_mult_lambda = s_cl_one[num_threads_active - 1];
+        num_mult = s_cl_mult[num_threads_active - 1];
+
+        *g_num_one = offset_mult_lambda;
+        *g_num_blocks_mult = num_blocks_mult;
+    }
+
+    __syncthreads();
+
+    NumericT left_2, right_2;
+    --s_cl_one;
+    --s_cl_mult;
+    --s_cl_blocking;
+
+    __syncthreads();
+    compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
+                        s_left, s_right, s_left_count, s_right_count,
+                        s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
+                        is_one_lambda, is_one_lambda_2,
+                        left, right, left_2, right_2,
+                        left_count, right_count, left_count_2, right_count_2,
+                        c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
+                       );
+
+    __syncthreads();
+
+    // final adjustment before writing out data to global memory
+    if (0 == tid)
+    {
+        s_cl_blocking[num_blocks_mult] = num_mult;
+        s_cl_helper[0] = 0;
+    }
+
+    __syncthreads();
+
+    // write to global memory
+    writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
+                g_left_one, g_right_one, g_pos_one,
+                g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
+                s_left, s_right, s_left_count, s_right_count,
+                g_blocks_mult, g_blocks_mult_sum,
+                s_compaction_list, s_cl_helper, offset_mult_lambda);
+
+}
+}
+}
+}
+#endif // #ifndef _BISECT_KERNEL_LARGE_H_
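
The kernel above packs the second-child intervals by running an exclusive scan over
the per-thread "has a second child" flags (createIndicesCompaction) and then
scattering the intervals to the resulting write positions (compactIntervals). A
minimal serial host-side C++ sketch of that compaction idea follows; the names and
data layout here are illustrative only, not the ViennaCL device helpers:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Exclusive scan over the flags: idx[i] is the write position of the
    // i-th second-child interval inside the compact chunk.
    std::vector<std::size_t> compaction_indices(std::vector<int> const & is_active_second)
    {
      std::vector<std::size_t> idx(is_active_second.size(), 0);
      std::size_t running = 0;
      for (std::size_t i = 0; i < is_active_second.size(); ++i)
      {
        idx[i] = running;
        running += is_active_second[i] ? 1 : 0;
      }
      return idx;
    }

    int main()
    {
      std::vector<int>    flags       = {1, 0, 1, 1, 0};           // which threads produced a second child
      std::vector<double> second_left = {0.1, 0.0, 0.5, 0.7, 0.0}; // dummy interval data
      std::vector<double> compact(3);                              // three flags are set

      std::vector<std::size_t> idx = compaction_indices(flags);
      for (std::size_t i = 0; i < flags.size(); ++i)
        if (flags[i])
          compact[idx[i]] = second_left[i];                        // scatter into the compact chunk

      for (double v : compact)
        std::cout << v << " ";                                     // prints: 0.1 0.5 0.7
      std::cout << std::endl;
    }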

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
new file mode 100755
index 0000000..a670256
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
@@ -0,0 +1,277 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
+    @brief Second step of the bisection algorithm for the computation of eigenvalues for large matrices.
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+/* Perform the second step of the bisection algorithm for large matrices, for
+ * intervals that, after the first step, contained more than one eigenvalue
+ */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Perform second step of bisection algorithm for large matrices for
+//! intervals that after the first step contained more than one eigenvalue
+//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
+//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
+//! @param  n    matrix size
+//! @param  blocks_mult  start addresses of blocks of intervals that are
+//!                      processed by one block of threads, each of the
+//!                      intervals contains more than one eigenvalue
+//! @param  blocks_mult_sum  total number of eigenvalues / singleton intervals
+//!                          in one block of intervals
+//! @param  g_left  left limits of intervals
+//! @param  g_right  right limits of intervals
+//! @param  g_left_count  number of eigenvalues less than left limits
+//! @param  g_right_count  number of eigenvalues less than right limits
+//! @param  g_lambda  final eigenvalue
+//! @param  g_pos  index of eigenvalue (in ascending order)
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+                                unsigned int *blocks_mult,
+                                unsigned int *blocks_mult_sum,
+                                NumericT *g_left, NumericT *g_right,
+                                unsigned int *g_left_count,
+                                unsigned int *g_right_count,
+                                NumericT *g_lambda, unsigned int *g_pos,
+                                NumericT precision
+                               )
+{
+    const unsigned int tid = threadIdx.x;
+
+    // left and right limits of interval
+    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+    // number of eigenvalues smaller than interval limits
+    __shared__  unsigned int  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+    __shared__  unsigned int  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+    // helper array for compaction of the second chunk
+    __shared__  unsigned int  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    // compaction list helper for exclusive scan
+    unsigned int *s_compaction_list_exc = s_compaction_list + 1;
+
+    // flag if all threads are converged
+    __shared__  unsigned int  all_threads_converged;
+    // number of active threads
+    __shared__  unsigned int  num_threads_active;
+    // number of threads to employ for compaction
+    __shared__  unsigned int  num_threads_compaction;
+    // flag if second chunk has to be compacted
+    __shared__  unsigned int compact_second_chunk;
+
+    // parameters of block of intervals processed by this block of threads
+    __shared__  unsigned int  c_block_start;
+    __shared__  unsigned int  c_block_end;
+    __shared__  unsigned int  c_block_offset_output;
+
+    // midpoint of currently active interval of the thread
+    NumericT mid = 0.0f;
+    // number of eigenvalues smaller than \a mid
+    unsigned int  mid_count = 0;
+    // current interval parameter
+    NumericT  left = 0.0f;
+    NumericT  right = 0.0f;
+    unsigned int  left_count = 0;
+    unsigned int  right_count = 0;
+    // helper for compaction; keeps track of which threads have a second child
+    unsigned int  is_active_second = 0;
+
+
+    __syncthreads();
+    // initialize common start conditions
+    if (0 == tid)
+    {
+
+        c_block_start = blocks_mult[blockIdx.x];
+        c_block_end = blocks_mult[blockIdx.x + 1];
+        c_block_offset_output = blocks_mult_sum[blockIdx.x];
+
+
+        num_threads_active = c_block_end - c_block_start;
+        s_compaction_list[0] = 0;
+        num_threads_compaction = ceilPow2(num_threads_active);
+
+        all_threads_converged = 1;
+        compact_second_chunk = 0;
+    }
+
+     s_left_count [tid] = 42;
+     s_right_count[tid] = 42;
+     s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+     s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+    __syncthreads();
+
+
+    // read data into shared memory
+    if (tid < num_threads_active)
+    {
+        s_left[tid]  = g_left[c_block_start + tid];
+        s_right[tid] = g_right[c_block_start + tid];
+        s_left_count[tid]  = g_left_count[c_block_start + tid];
+        s_right_count[tid] = g_right_count[c_block_start + tid];
+    }
+
+    __syncthreads();
+    unsigned int iter = 0;
+    // do until all threads converged
+    while (true)
+    {
+        iter++;
+        //for (int iter=0; iter < 0; iter++) {
+        s_compaction_list[threadIdx.x] = 0;
+        s_compaction_list[threadIdx.x + blockDim.x] = 0;
+        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+        // subdivide interval if currently active and not already converged
+        subdivideActiveIntervalMulti(tid, s_left, s_right,
+                                s_left_count, s_right_count,
+                                num_threads_active,
+                                left, right, left_count, right_count,
+                                mid, all_threads_converged);
+        __syncthreads();
+
+        // stop if all eigenvalues have been found
+        if (1 == all_threads_converged)
+        {
+
+            break;
+        }
+
+        // compute the number of eigenvalues smaller than mid for active and not
+        // yet converged intervals; use all threads for loading data from gmem,
+        // with s_left and s_right serving as scratch space to hold the data
+        // loaded from gmem in shared memory
+        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+                                                    mid, tid, num_threads_active,
+                                                    s_left, s_right,
+                                                    (left == right));
+
+        __syncthreads();
+
+        if (tid < num_threads_active)
+        {
+
+            // store intervals
+            if (left != right)
+            {
+
+                storeNonEmptyIntervals(tid, num_threads_active,
+                                       s_left, s_right, s_left_count, s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       precision, compact_second_chunk,
+                                       s_compaction_list_exc,
+                                       is_active_second);
+
+            }
+            else
+            {
+
+                storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       s_compaction_list_exc, compact_second_chunk,
+                                       num_threads_active,
+                                       is_active_second);
+
+            }
+        }
+
+        __syncthreads();
+
+        // compact second chunk of intervals if any of the threads generated
+        // two child intervals
+        if (1 == compact_second_chunk)
+        {
+
+            createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+            compactIntervals(s_left, s_right, s_left_count, s_right_count,
+                             mid, right, mid_count, right_count,
+                             s_compaction_list, num_threads_active,
+                             is_active_second);
+        }
+
+        __syncthreads();
+
+        // update state variables
+        if (0 == tid)
+        {
+            num_threads_active += s_compaction_list[num_threads_active];
+            num_threads_compaction = ceilPow2(num_threads_active);
+
+            compact_second_chunk = 0;
+            all_threads_converged = 1;
+        }
+
+        __syncthreads();
+
+        // clear
+        s_compaction_list_exc[threadIdx.x] = 0;
+        s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
+
+        if (num_threads_compaction > blockDim.x)
+        {
+          break;
+        }
+
+
+        __syncthreads();
+
+    }  // end until all threads converged
+
+    // write data back to global memory
+    if (tid < num_threads_active)
+    {
+
+        unsigned int addr = c_block_offset_output + tid;
+
+        g_lambda[addr]  = s_left[tid];
+        g_pos[addr]   = s_right_count[tid];
+    }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
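
Both bisection kernels rely on a routine (computeNumSmallerEigenvalsLarge() above)
that counts how many eigenvalues of the symmetric tridiagonal matrix lie below a
given shift. A standard serial way to compute this is the Sturm-sequence count; a
minimal C++ sketch is given below. The off-diagonal storage convention (s[0]
unused, s[i] coupling rows i-1 and i) and the epsilon guard are assumptions of
this sketch, not the ViennaCL implementation:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Count the eigenvalues of a symmetric tridiagonal matrix that are smaller
    // than x. d holds the diagonal, s the off-diagonal (s[0] unused). Each
    // negative term of the Sturm recurrence corresponds to one eigenvalue < x.
    unsigned int count_eigenvalues_smaller_than(std::vector<double> const & d,
                                                std::vector<double> const & s,
                                                double x)
    {
      double const eps = 1e-30;   // guards against division by zero
      unsigned int count = 0;
      double q = 1.0;             // q_0 > 0 by convention
      for (std::size_t i = 0; i < d.size(); ++i)
      {
        double off   = (i == 0) ? 0.0 : s[i];
        double denom = (std::fabs(q) < eps) ? std::copysign(eps, q) : q;
        q = d[i] - x - off * off / denom;
        if (q < 0.0)
          ++count;
      }
      return count;
    }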


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
new file mode 100644
index 0000000..45d6987
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
@@ -0,0 +1,1468 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_row.hpp
+    @brief Implementations of row-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_row_kernel(
+          DestNumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const SrcNumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
+}
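
All kernels in this file address a sub-matrix of a padded row-major buffer through
the same strided formula: element (row, col) lives at flat position
(row * inc1 + start1) * internal_size2 + col * inc2 + start2. A small host-side
sketch of that mapping, with made-up sizes purely for illustration:

    #include <cassert>

    // Flat index of element (row, col) of a strided sub-matrix inside a
    // row-major buffer whose padded leading dimension is internal_size2
    // (the same formula used by the kernels in this file).
    inline unsigned int row_major_index(unsigned int row,    unsigned int col,
                                        unsigned int start1, unsigned int start2,
                                        unsigned int inc1,   unsigned int inc2,
                                        unsigned int internal_size2)
    {
      return (row * inc1 + start1) * internal_size2 + col * inc2 + start2;
    }

    int main()
    {
      // a sub-matrix starting at (1, 2) of a padded 8x8 buffer, unit strides:
      assert(row_major_index(0, 0, 1, 2, 1, 1, 8) == 10);  // element (1,2) -> 1*8 + 2
      assert(row_major_index(1, 2, 1, 2, 1, 1, 8) == 20);  // element (2,4) -> 2*8 + 4
      return 0;
    }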
+
+// Matrix transpose kernel
+template<typename NumericT>
+__global__ void trans_kernel(
+          const NumericT * A,
+          unsigned int A_start1,          unsigned int A_start2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          unsigned int A_size1,           unsigned int A_size2,
+          unsigned int A_stride1,         unsigned int A_stride2,
+
+          NumericT * B,
+          unsigned int B_start1,          unsigned int B_start2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+          unsigned int B_stride1,         unsigned int B_stride2,
+          bool data_major)
+{
+  for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x)
+  {
+    for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
+    {
+      if(data_major)
+        B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
+      else
+        B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
+     }
+  }
+}
+
+//
+// am
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void am_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+  }
+}
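
The options2 word used by the am/ambm kernels packs two flags: bit 0 negates the
scalar and bit 1 turns the multiplication into a division. A short host-side
sketch of the same decoding for a single value (illustrative only, not a ViennaCL
API):

    // Decode the option bits and apply the scaling to one value:
    // bit 0 -> negate alpha, bit 1 -> divide by alpha instead of multiplying.
    template<typename NumericT>
    NumericT apply_scaling(NumericT b, NumericT fac, unsigned int options)
    {
      NumericT alpha = fac;
      if (options & (1u << 0))
        alpha = -alpha;
      return (options & (1u << 1)) ? b / alpha : b * alpha;
    }

    // apply_scaling(6.0, 2.0, 0u) == 12.0   (plain multiply)
    // apply_scaling(6.0, 2.0, 3u) == -3.0   (negate, then divide)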
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void am_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+  }
+}
+
+
+//
+// ambm
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+
+//
+// ambm_m
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          NumericT fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * fac2,
+          unsigned int options2,
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * fac3,
+          unsigned int options3,
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+    }
+    else
+    {
+      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+       += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+        + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+    }
+  }
+}
+
+//
+// assignments
+//
+
+template<typename NumericT>
+__global__ void matrix_row_assign_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT alpha)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
+}
+
+
+template<typename NumericT>
+__global__ void matrix_row_diagonal_assign_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT alpha)
+{
+  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
+    A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
+}
+
+//
+// binary element-wise operations
+//
+
+template<typename NumericT>
+__global__ void element_op_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+          unsigned int op_type) //0: product, 1: division, 2: pow
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (op_type == 2)
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+      = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
+            C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
+  }
+  else if (op_type == 1)
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+      / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+      * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+  }
+}
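
The op_type argument selects the element-wise operation applied to each pair of
entries (0: product, 1: division, 2: pow). A compact host-side equivalent for a
single element, shown only for illustration:

    #include <cmath>

    // Host-side analogue of one element of element_op_row_kernel:
    // op_type 0 -> b * c, 1 -> b / c, 2 -> pow(b, c).
    template<typename NumericT>
    NumericT element_op(NumericT b, NumericT c, unsigned int op_type)
    {
      if (op_type == 2)
        return std::pow(b, c);
      if (op_type == 1)
        return b / c;
      return b * c;   // op_type == 0
    }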
+
+template<typename NumericT>
+__global__ void element_op_int_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+          const NumericT * C,
+          unsigned int C_start1, unsigned int C_start2,
+          unsigned int C_inc1,   unsigned int C_inc2,
+          unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+          unsigned int op_type) //0: product, 1: division, 2: pow
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  if (op_type == 1)
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+      / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+      * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+  }
+}
+
+//
+// unary element-wise operations
+//
+
+// abs
+template<typename NumericT>
+__global__ void matrix_row_element_abs_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// acos
+template<typename NumericT>
+__global__ void matrix_row_element_acos_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// asin
+template<typename NumericT>
+__global__ void matrix_row_element_asin_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void matrix_row_element_atan_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void matrix_row_element_ceil_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void matrix_row_element_cos_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void matrix_row_element_cosh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void matrix_row_element_exp_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void matrix_row_element_fabs_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// floor
+template<typename NumericT>
+__global__ void matrix_row_element_floor_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// log
+template<typename NumericT>
+__global__ void matrix_row_element_log_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void matrix_row_element_log10_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void matrix_row_element_sin_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void matrix_row_element_sinh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void matrix_row_element_sqrt_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void matrix_row_element_tan_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void matrix_row_element_tanh_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_internal_size1,  unsigned int B_internal_size2)
+{
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+
+//
+// matrix-vector product
+//
+
+template<typename NumericT>
+__global__ void vec_mul_row_kernel(
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+          NumericT * result,
+          unsigned int result_start,
+          unsigned int result_inc,
+          unsigned int result_size)
+{
+  __shared__ NumericT work[128];
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+  unsigned int lid = threadIdx.x;
+
+  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
+  {
+    NumericT dot_prod = 0;
+    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
+      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
+    work[lid] = dot_prod;
+
+    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
+      __syncthreads();
+      if (lid < stride)
+        work[lid] += work[lid+stride];
+    }
+
+    if (lid == 0)
+      result[row * result_inc + result_start] = work[0];
+  }
+}
+
+
+template<typename NumericT>
+__global__ void trans_vec_mul_row_kernel(
+          const NumericT * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_inc,
+          unsigned int A_col_inc,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          const NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+          NumericT * result,
+          unsigned int result_start,
+          unsigned int result_inc,
+          unsigned int result_size)
+{
+  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
+  {
+    NumericT dot_prod = 0;
+    for (unsigned int col = 0; col < A_row_size; ++col)
+      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
+    result[row * result_inc + result_start] = dot_prod;
+  }
+}
+
+
+//
+// matrix-matrix products
+//
+
+
+
+
+//
+// scaled rank-1-update
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          NumericT val,
+          unsigned int options2,
+
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+
+          const NumericT * vec2,
+          unsigned int start2,
+          unsigned int inc2,
+          unsigned int size2)
+{
+  NumericT alpha = val;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+  {
+    NumericT tmp = alpha * vec1[row * inc1 + start1];
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
+  }
+}
+
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_row_kernel(
+          NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+          const NumericT * val,
+          unsigned int options2,
+
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+
+          const NumericT * vec2,
+          unsigned int start2,
+          unsigned int inc2,
+          unsigned int size2)
+{
+  NumericT alpha = *val;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+  {
+    NumericT tmp = alpha * vec1[row * inc1 + start1];
+    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
+  }
+}
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
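
A side note on the indexing used throughout the kernels above: with a one-dimensional launch, (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x is simply blockIdx.x and the corresponding modulo is threadIdx.x, so each block strides over matrix rows while each thread strides over the columns of its row; vec_mul_row_kernel then combines the per-thread partial dot products with a shared-memory tree reduction. The following self-contained sketch shows the same pattern for a plain, unpadded row-major matrix. The kernel name, the test sizes, the <<<128, 128>>> launch and the assumption of a power-of-two block size of at most 128 threads (implied by the fixed work[128] buffer above) are illustrative and not part of the patch.

#include <cuda_runtime.h>
#include <cstdio>

// Sketch only: same work distribution and shared-memory reduction as
// vec_mul_row_kernel above, but for a plain row-major matrix without
// start/inc/padding parameters. Assumes blockDim.x is a power of two <= 128.
__global__ void simple_row_major_matvec(const float * A, const float * x,
                                        float * y, unsigned int rows, unsigned int cols)
{
  __shared__ float work[128];

  unsigned int row_gid = blockIdx.x;    // equals (gid / blockDim.x) for a 1-D launch
  unsigned int col_gid = threadIdx.x;   // equals (gid % blockDim.x)

  for (unsigned int row = row_gid; row < rows; row += gridDim.x)
  {
    float partial = 0.0f;
    for (unsigned int col = col_gid; col < cols; col += blockDim.x)
      partial += A[row * cols + col] * x[col];
    work[threadIdx.x] = partial;

    // tree reduction of the per-thread partial sums into work[0]
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        work[threadIdx.x] += work[threadIdx.x + stride];
    }

    if (threadIdx.x == 0)
      y[row] = work[0];
    __syncthreads();   // make work[] safe to overwrite in the next row iteration
  }
}

int main()
{
  const unsigned int rows = 3, cols = 4;
  float hA[rows * cols], hx[cols], hy[rows];
  for (unsigned int i = 0; i < rows * cols; ++i) hA[i] = 1.0f;
  for (unsigned int j = 0; j < cols; ++j)        hx[j] = 2.0f;

  float *dA, *dx, *dy;
  cudaMalloc(&dA, sizeof(hA)); cudaMalloc(&dx, sizeof(hx)); cudaMalloc(&dy, sizeof(hy));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
  cudaMemcpy(dx, hx, sizeof(hx), cudaMemcpyHostToDevice);

  simple_row_major_matvec<<<128, 128>>>(dA, dx, dy, rows, cols);
  cudaMemcpy(hy, dy, sizeof(hy), cudaMemcpyDeviceToHost);
  printf("y[0] = %f (expected 8)\n", hy[0]);

  cudaFree(dA); cudaFree(dx); cudaFree(dy);
  return 0;
}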

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
new file mode 100644
index 0000000..4821f5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
@@ -0,0 +1,91 @@
+#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/misc_operations.hpp
+    @brief Implementations of miscellaneous operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+
+template<typename NumericT>
+__global__ void level_scheduling_substitute_kernel(
+          const unsigned int * row_index_array,
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          NumericT * vec,
+          unsigned int size)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < size;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int eq_row = row_index_array[row];
+    NumericT vec_entry = vec[eq_row];
+    unsigned int row_end = row_indices[row+1];
+
+    for (unsigned int j = row_indices[row]; j < row_end; ++j)
+      vec_entry -= vec[column_indices[j]] * elements[j];
+
+    vec[eq_row] = vec_entry;
+  }
+}
+
+
+
+template<typename NumericT>
+void level_scheduling_substitute(vector<NumericT> & vec,
+                             viennacl::backend::mem_handle const & row_index_array,
+                             viennacl::backend::mem_handle const & row_buffer,
+                             viennacl::backend::mem_handle const & col_buffer,
+                             viennacl::backend::mem_handle const & element_buffer,
+                             vcl_size_t num_rows
+                            )
+{
+  level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(row_index_array),
+                                                   viennacl::cuda_arg<unsigned int>(row_buffer),
+                                                   viennacl::cuda_arg<unsigned int>(col_buffer),
+                                                   viennacl::cuda_arg<NumericT>(element_buffer),
+                                                   viennacl::cuda_arg(vec),
+                                                   static_cast<unsigned int>(num_rows)
+                                                  );
+}
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
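
level_scheduling_substitute applies one level of a level-scheduled sparse triangular substitution: row_index_array maps the k-th row of the level to its equation row in vec, while row_indices/column_indices/elements form a CSR-style slice of the already eliminated off-diagonal entries of those rows. Rows inside one level depend only on earlier levels, so they can be processed in parallel (one CUDA thread per row here, an OpenMP loop in the host-based variant further down). A minimal host-side sketch of the same update on made-up data, purely for illustration:

#include <vector>
#include <cstdio>

// Sketch of one level of the substitution above, on the host.
// row_index_array maps the k-th row of the level to its equation row in vec;
// (row_ptr, col_idx, values) are a CSR-style slice of the already eliminated
// off-diagonal entries of those rows. All numbers are made up.
int main()
{
  std::vector<double> vec = {1.0, 2.0, 3.0, 4.0};        // current solution vector

  std::vector<unsigned int> row_index_array = {2, 3};     // this level updates rows 2 and 3
  std::vector<unsigned int> row_ptr = {0, 1, 3};          // CSR row pointer of the slice
  std::vector<unsigned int> col_idx = {0, 0, 1};          // dependencies on earlier rows
  std::vector<double>       values  = {0.5, 0.25, 0.5};

  // Rows within a level are mutually independent, so this loop could run in
  // parallel exactly as in the CUDA kernel above.
  for (unsigned int row = 0; row < row_index_array.size(); ++row)
  {
    unsigned int eq_row = row_index_array[row];
    double entry = vec[eq_row];
    for (unsigned int j = row_ptr[row]; j < row_ptr[row + 1]; ++j)
      entry -= vec[col_idx[j]] * values[j];
    vec[eq_row] = entry;
  }

  // vec[2] = 3 - 0.5*1 = 2.5,  vec[3] = 4 - 0.25*1 - 0.5*2 = 2.75
  printf("vec = [%g, %g, %g, %g]\n", vec[0], vec[1], vec[2], vec[3]);
  return 0;
}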

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
new file mode 100644
index 0000000..109f74f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
@@ -0,0 +1,152 @@
+#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/nmf_operations.hpp
+ @brief Implementations of NMF operations using CUDA
+ */
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Main CUDA kernel for nonnegative matrix factorization of dense matrices. */
+template<typename NumericT>
+__global__ void el_wise_mul_div(NumericT       * matrix1,
+                                NumericT const * matrix2,
+                                NumericT const * matrix3,
+                                unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i +=gridDim.x * blockDim.x)
+  {
+    NumericT val = matrix1[i] * matrix2[i];
+    NumericT divisor = matrix3[i];
+    matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : NumericT(0);
+  }
+}
+
+/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V     Input matrix
+ * @param W     First factor
+ * @param H     Second factor
+ * @param conf  A configuration object holding tolerances and the like
+ */
+template<typename NumericT>
+void nmf(viennacl::matrix_base<NumericT> const & V,
+         viennacl::matrix_base<NumericT> & W,
+         viennacl::matrix_base<NumericT> & H,
+         viennacl::linalg::nmf_config const & conf)
+{
+  vcl_size_t k = W.size2();
+  conf.iters_ = 0;
+
+  if (viennacl::linalg::norm_frobenius(W) <= 0)
+    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));
+
+  if (viennacl::linalg::norm_frobenius(H) <= 0)
+    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));
+
+  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
+  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
+  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());
+
+  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
+  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
+  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());
+
+  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());
+
+  viennacl::vector<NumericT> diff(V.size1() * V.size2());
+
+  NumericT last_diff = 0;
+  NumericT diff_init = 0;
+  bool stagnation_flag = false;
+
+  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+  {
+    conf.iters_ = i + 1;
+
+    hn = viennacl::linalg::prod(trans(W), V);
+    htmp = viennacl::linalg::prod(trans(W), W);
+    hd = viennacl::linalg::prod(htmp, H);
+
+    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H),
+                                  viennacl::cuda_arg<NumericT>(hn),
+                                  viennacl::cuda_arg<NumericT>(hd),
+                                  static_cast<unsigned int>(H.internal_size1() * H.internal_size2()));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");
+
+    wn   = viennacl::linalg::prod(V, trans(H));
+    wtmp = viennacl::linalg::prod(W, H);
+    wd   = viennacl::linalg::prod(wtmp, trans(H));
+
+    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W),
+                                  viennacl::cuda_arg<NumericT>(wn),
+                                  viennacl::cuda_arg<NumericT>(wd),
+                                  static_cast<unsigned int>( W.internal_size1() * W.internal_size2()));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");
+
+    if (i % conf.check_after_steps() == 0)  //check for convergence
+    {
+      appr = viennacl::linalg::prod(W, H);
+
+      appr -= V;
+      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+      if (i == 0)
+        diff_init = diff_val;
+
+      if (conf.print_relative_error())
+        std::cout << diff_val / diff_init << std::endl;
+
+      // Approximation check
+      if (diff_val / diff_init < conf.tolerance())
+        break;
+
+      // Stagnation check
+      if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+      {
+        if (stagnation_flag)  // iteration stagnates (two iterates with no notable progress)
+          break;
+        else
+          // record stagnation in this iteration
+          stagnation_flag = true;
+      } else
+        // good progress in this iteration, so unset stagnation flag
+        stagnation_flag = false;
+
+      // prepare for next iterate:
+      last_diff = diff_val;
+    }
+  }
+}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */
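
For reference when matching the prod() and el_wise_mul_div calls above to the underlying math: each pass of the loop performs the Lee-Seung multiplicative updates

  H \leftarrow H \odot \frac{W^{\mathsf T} V}{W^{\mathsf T} W\, H}, \qquad
  W \leftarrow W \odot \frac{V H^{\mathsf T}}{W\, H H^{\mathsf T}},

where \odot is the element-wise (Hadamard) product and the fractions are element-wise divisions, guarded in el_wise_mul_div against divisors below 1e-5. Convergence is checked every check_after_steps() iterations via the relative residual ||V - W H||_F / ||V - W_0 H_0||_F.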

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
new file mode 100644
index 0000000..3adaca2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
@@ -0,0 +1,375 @@
+#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/scalar_operations.hpp
+    @brief Implementations of scalar operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/////////////////// as /////////////////////////////
+
+template<typename NumericT>
+__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  *s1 = *s2 * alpha;
+}
+
+template<typename NumericT>
+__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  if (options2 & (1 << 1))
+    alpha = NumericT(1) / alpha;
+
+  *s1 = *s2 * alpha;
+}
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_any_scalar<NumericT>::value
+                            >::type
+as(ScalarT1 & s1,
+   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<NumericT>::value)
+    temporary_alpha = alpha;
+
+  as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+                      viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                      options_alpha,
+                      viennacl::cuda_arg(s2));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
+}
+
+//////////////////// asbs ////////////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+                            const NumericT * fac2, unsigned int options2, const NumericT * s2,
+                            const NumericT * fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = *fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = *fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+                            NumericT fac2,         unsigned int options2, const NumericT * s2,
+                            NumericT const * fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = *fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+                            NumericT const * fac2, unsigned int options2, const NumericT * s2,
+                            NumericT         fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = *fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+                            NumericT fac2, unsigned int options2, const NumericT * s2,
+                            NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 = *s2 * alpha + *s3 * beta;
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT1,
+         typename ScalarT3, typename NumericT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT1>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                            >::type
+asbs(ScalarT1 & s1,
+     ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+     ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<NumericT1>::value)
+    temporary_alpha = alpha;
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<NumericT2>::value)
+    temporary_beta = beta;
+
+  asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+                        viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                        options_alpha,
+                        viennacl::cuda_arg(s2),
+                        viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                        options_beta,
+                        viennacl::cuda_arg(s3) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel");
+}
+
+//////////////////// asbs_s ////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+                              const NumericT * fac2, unsigned int options2, const NumericT * s2,
+                              const NumericT * fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = *fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = *fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+                              NumericT         fac2, unsigned int options2, const NumericT * s2,
+                              NumericT const * fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = *fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+                              NumericT const * fac2, unsigned int options2, const NumericT * s2,
+                              NumericT         fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = *fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+                              NumericT fac2, unsigned int options2, const NumericT * s2,
+                              NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+    NumericT alpha = fac2;
+    if (options2 & (1 << 0))
+      alpha = -alpha;
+    if (options2 & (1 << 1))
+      alpha = NumericT(1) / alpha;
+
+    NumericT beta = fac3;
+    if (options3 & (1 << 0))
+      beta = -beta;
+    if (options3 & (1 << 1))
+      beta = NumericT(1) / beta;
+
+    *s1 += *s2 * alpha + *s3 * beta;
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT1,
+         typename ScalarT3, typename NumericT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT1>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                            >::type
+asbs_s(ScalarT1 & s1,
+       ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+       ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<NumericT1>::value)
+    temporary_alpha = alpha;
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<NumericT2>::value)
+    temporary_beta = beta;
+
+  std::cout << "Launching asbs_s_kernel..." << std::endl;
+  asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                          options_alpha,
+                          viennacl::cuda_arg(s2),
+                          viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                          options_beta,
+                          viennacl::cuda_arg(s3) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel");
+}
+
+///////////////// swap //////////////////
+
+template<typename NumericT>
+__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2)
+{
+  NumericT tmp = *s2;
+  *s2 = *s1;
+  *s1 = tmp;
+}
+
+/** @brief Swaps the contents of two scalars, data is copied
+*
+* @param s1   The first scalar
+* @param s2   The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                            >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2));
+}
+
+
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
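
All scalar kernels in this file decode the same options word produced by detail::make_options: bit 0 requests a sign flip of the scaling factor, bit 1 its reciprocal, and both bits together yield -1/alpha. A minimal host-side sketch of that decoding; the helper name is purely illustrative and not part of ViennaCL's API.

#include <cstdio>

// Illustrative decoding of the 'options' word used by the scalar kernels above:
// bit 0 set -> negate the factor, bit 1 set -> take its reciprocal.
// Mirrors the branches at the top of as_kernel / asbs_kernel / asbs_s_kernel.
template <typename NumericT>
NumericT apply_scalar_options(NumericT factor, unsigned int options)
{
  if (options & (1u << 0))
    factor = -factor;
  if (options & (1u << 1))
    factor = NumericT(1) / factor;
  return factor;
}

int main()
{
  double alpha = 4.0;
  printf("%g %g %g %g\n",
         apply_scalar_options(alpha, 0u),   // 4     (use as-is)
         apply_scalar_options(alpha, 1u),   // -4    (flip sign)
         apply_scalar_options(alpha, 2u),   // 0.25  (reciprocal)
         apply_scalar_options(alpha, 3u));  // -0.25 (flip, then reciprocal)
  return 0;
}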


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
new file mode 100644
index 0000000..11061d9
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
@@ -0,0 +1,80 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/misc_operations.hpp
+    @brief Implementations of miscellaneous operations on the CPU using a single thread or OpenMP.
+*/
+
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+  template<typename NumericT>
+  void level_scheduling_substitute(vector<NumericT> & vec,
+                                   viennacl::backend::mem_handle const & row_index_array,
+                                   viennacl::backend::mem_handle const & row_buffer,
+                                   viennacl::backend::mem_handle const & col_buffer,
+                                   viennacl::backend::mem_handle const & element_buffer,
+                                   vcl_size_t num_rows
+                                  )
+  {
+    NumericT * vec_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(vec.handle());
+
+    unsigned int const * elim_row_index  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_index_array);
+    unsigned int const * elim_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_buffer);
+    unsigned int const * elim_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(col_buffer);
+    NumericT     const * elim_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(element_buffer);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long row=0; row < static_cast<long>(num_rows); ++row)
+    {
+      unsigned int  eq_row = elim_row_index[row];
+      unsigned int row_end = elim_row_buffer[row+1];
+      NumericT   vec_entry = vec_buf[eq_row];
+
+      for (vcl_size_t j = elim_row_buffer[row]; j < row_end; ++j)
+        vec_entry -= vec_buf[elim_col_buffer[j]] * elim_elements[j];
+
+      vec_buf[eq_row] = vec_entry;
+    }
+
+  }
+}
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
new file mode 100644
index 0000000..bb6557f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
@@ -0,0 +1,247 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/host_based/nmf_operations.hpp
+ @brief Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU
+ */
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Configuration class for the nonnegative-matrix-factorization algorithm. Specify tolerances, maximum iteration counts, etc., here. */
+class nmf_config
+{
+public:
+  nmf_config(double val_epsilon = 1e-4, double val_epsilon_stagnation = 1e-5,
+      vcl_size_t num_max_iters = 10000, vcl_size_t num_check_iters = 100) :
+      eps_(val_epsilon), stagnation_eps_(val_epsilon_stagnation), max_iters_(num_max_iters), check_after_steps_(
+          (num_check_iters > 0) ? num_check_iters : 1), print_relative_error_(false), iters_(0)
+  {
+  }
+
+  /** @brief Returns the relative tolerance for convergence */
+  double tolerance() const
+  {
+    return eps_;
+  }
+
+  /** @brief Sets the relative tolerance for convergence, i.e. norm(V - W * H) / norm(V - W_init * H_init) */
+  void tolerance(double e)
+  {
+    eps_ = e;
+  }
+
+  /** @brief Relative tolerance for the stagnation check */
+  double stagnation_tolerance() const
+  {
+    return stagnation_eps_;
+  }
+
+  /** @brief Sets the tolerance for the stagnation check (i.e. the minimum required relative change of the residual between two iterations) */
+  void stagnation_tolerance(double e)
+  {
+    stagnation_eps_ = e;
+  }
+
+  /** @brief Returns the maximum number of iterations for the NMF algorithm */
+  vcl_size_t max_iterations() const
+  {
+    return max_iters_;
+  }
+  /** @brief Sets the maximum number of iterations for the NMF algorithm */
+  void max_iterations(vcl_size_t m)
+  {
+    max_iters_ = m;
+  }
+
+  /** @brief Returns the number of iterations of the last NMF run using this configuration object */
+  vcl_size_t iters() const
+  {
+    return iters_;
+  }
+
+  /** @brief Number of steps after which the convergence of NMF should be checked (again) */
+  vcl_size_t check_after_steps() const
+  {
+    return check_after_steps_;
+  }
+  /** @brief Set the number of steps after which the convergence of NMF should be checked (again) */
+  void check_after_steps(vcl_size_t c)
+  {
+    if (c > 0)
+      check_after_steps_ = c;
+  }
+
+  /** @brief Returns the flag specifying whether the relative tolerance should be printed in each iteration */
+  bool print_relative_error() const
+  {
+    return print_relative_error_;
+  }
+  /** @brief Specify whether the relative error should be printed at each convergence check after 'num_check_iters' steps */
+  void print_relative_error(bool b)
+  {
+    print_relative_error_ = b;
+  }
+
+  template<typename ScalarType>
+  friend void nmf(viennacl::matrix_base<ScalarType> const & V,
+      viennacl::matrix_base<ScalarType> & W, viennacl::matrix_base<ScalarType> & H,
+      nmf_config const & conf);
+
+private:
+  double eps_;
+  double stagnation_eps_;
+  vcl_size_t max_iters_;
+  vcl_size_t check_after_steps_;
+  bool print_relative_error_;
+public:
+  mutable vcl_size_t iters_;
+};
+
+namespace host_based
+{
+  /** @brief Host-based (optionally OpenMP-parallel) element-wise multiply-divide kernel used by the nonnegative matrix factorization of dense matrices. */
+  template<typename NumericT>
+  void el_wise_mul_div(NumericT       * matrix1,
+                       NumericT const * matrix2,
+                       NumericT const * matrix3, vcl_size_t size)
+  {
+#ifdef VIENNACL_WITH_OPENMP
+#pragma omp parallel for
+#endif
+    for (long i2 = 0; i2 < long(size); i2++)
+    {
+      vcl_size_t i = vcl_size_t(i2);
+      NumericT val     = matrix1[i] * matrix2[i];
+      NumericT divisor = matrix3[i];
+      matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : (NumericT) 0;
+    }
+  }
+
+  /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+   *
+   * @param V     Input matrix
+   * @param W     First factor
+   * @param H     Second factor
+   * @param conf  A configuration object holding tolerances and the like
+   */
+  template<typename NumericT>
+  void nmf(viennacl::matrix_base<NumericT> const & V,
+           viennacl::matrix_base<NumericT> & W,
+           viennacl::matrix_base<NumericT> & H,
+           viennacl::linalg::nmf_config const & conf)
+  {
+    vcl_size_t k = W.size2();
+    conf.iters_ = 0;
+
+    if (viennacl::linalg::norm_frobenius(W) <= 0)
+      W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));
+
+    if (viennacl::linalg::norm_frobenius(H) <= 0)
+      H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));
+
+    viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
+    viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
+    viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());
+
+    viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
+    viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
+    viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());
+
+    viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());
+
+    NumericT last_diff = 0;
+    NumericT diff_init = 0;
+    bool stagnation_flag = false;
+
+    for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+    {
+      conf.iters_ = i + 1;
+
+      hn   = viennacl::linalg::prod(trans(W), V);
+      htmp = viennacl::linalg::prod(trans(W), W);
+      hd   = viennacl::linalg::prod(htmp, H);
+
+      NumericT * data_H  = detail::extract_raw_pointer<NumericT>(H);
+      NumericT * data_hn = detail::extract_raw_pointer<NumericT>(hn);
+      NumericT * data_hd = detail::extract_raw_pointer<NumericT>(hd);
+
+      viennacl::linalg::host_based::el_wise_mul_div(data_H, data_hn, data_hd, H.internal_size1() * H.internal_size2());
+
+      wn   = viennacl::linalg::prod(V, trans(H));
+      wtmp = viennacl::linalg::prod(W, H);
+      wd   = viennacl::linalg::prod(wtmp, trans(H));
+
+      NumericT * data_W  = detail::extract_raw_pointer<NumericT>(W);
+      NumericT * data_wn = detail::extract_raw_pointer<NumericT>(wn);
+      NumericT * data_wd = detail::extract_raw_pointer<NumericT>(wd);
+
+      viennacl::linalg::host_based::el_wise_mul_div(data_W, data_wn, data_wd, W.internal_size1() * W.internal_size2());
+
+      if (i % conf.check_after_steps() == 0)  //check for convergence
+      {
+        appr = viennacl::linalg::prod(W, H);
+
+        appr -= V;
+        NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+        if (i == 0)
+          diff_init = diff_val;
+
+        if (conf.print_relative_error())
+          std::cout << diff_val / diff_init << std::endl;
+
+        // Approximation check
+        if (diff_val / diff_init < conf.tolerance())
+          break;
+
+        // Stagnation check
+        if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+        {
+          if (stagnation_flag)    // iteration stagnates (two iterates with no notable progress)
+            break;
+          else
+            // record stagnation in this iteration
+            stagnation_flag = true;
+        } else
+          // good progress in this iteration, so unset stagnation flag
+          stagnation_flag = false;
+
+        // prepare for next iterate:
+        last_diff = diff_val;
+      }
+    }
+  }
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_ */
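
A minimal driver for the nmf() routine above (illustrative, not part of this patch). It assumes the public dispatch header viennacl/linalg/nmf.hpp, with viennacl::linalg::nmf() and a default-constructible nmf_config, accompanies this host-based implementation, and that viennacl/linalg/prod.hpp and viennacl/linalg/norm_frobenius.hpp are available for the residual check:

#include <cstddef>
#include <iostream>

#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"             // assumed include for prod()
#include "viennacl/linalg/norm_frobenius.hpp"   // assumed include for norm_frobenius()
#include "viennacl/linalg/nmf.hpp"              // assumed dispatch header for nmf()/nmf_config

int main()
{
  const std::size_t m = 4, n = 3, k = 2;
  viennacl::matrix<float> V(m, n), W(m, k), H(k, n);

  for (std::size_t i = 0; i < m; ++i)           // nonnegative input matrix
    for (std::size_t j = 0; j < n; ++j)
      V(i, j) = static_cast<float>(i + j + 1);

  // seed the factors with ones (the same fallback nmf() applies to all-zero factors)
  W = viennacl::scalar_matrix<float>(m, k, 1.0f);
  H = viennacl::scalar_matrix<float>(k, n, 1.0f);

  viennacl::linalg::nmf_config conf;            // default tolerances and iteration limits
  viennacl::linalg::nmf(V, W, H, conf);         // W*H now approximates V

  viennacl::matrix<float> R(m, n);
  R = viennacl::linalg::prod(W, H);
  R -= V;
  float err = viennacl::linalg::norm_frobenius(R);
  std::cout << "||V - W*H||_F = " << err << std::endl;
  return 0;
}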

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
new file mode 100644
index 0000000..f8a1f3b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
@@ -0,0 +1,162 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/scalar_operations.hpp
+    @brief Implementations of scalar operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+template<typename ScalarT1,
+         typename ScalarT2, typename FactorT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_any_scalar<FactorT>::value
+                            >::type
+as(ScalarT1       & s1,
+   ScalarT2 const & s2, FactorT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+  value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  *data_s1 = *data_s2 * data_alpha;
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename FactorT2,
+         typename ScalarT3, typename FactorT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<FactorT2>::value
+                              && viennacl::is_any_scalar<FactorT3>::value
+                            >::type
+asbs(ScalarT1       & s1,
+     ScalarT2 const & s2, FactorT2 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+     ScalarT3 const & s3, FactorT3 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+  value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+  value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+  if (reciprocal_beta)
+    data_beta = static_cast<value_type>(1) / data_beta;
+
+  *data_s1 = *data_s2 * data_alpha + *data_s3 * data_beta;
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename FactorT2,
+         typename ScalarT3, typename FactorT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<FactorT2>::value
+                              && viennacl::is_any_scalar<FactorT3>::value
+                            >::type
+asbs_s(ScalarT1       & s1,
+       ScalarT2 const & s2, FactorT2 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+       ScalarT3 const & s3, FactorT3 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+  value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+  value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+  if (reciprocal_beta)
+    data_beta = static_cast<value_type>(1) / data_beta;
+
+  *data_s1 += *data_s2 * data_alpha + *data_s3 * data_beta;
+}
+
+
+/** @brief Swaps the contents of two scalars; the data is copied.
+*
+* @param s1   The first scalar
+* @param s2   The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                            >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+
+  value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+  value_type * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+  value_type temp = *data_s2;
+  *data_s2 = *data_s1;
+  *data_s1 = temp;
+}
+
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
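
The scalar kernels above share one pattern: optionally negate and/or invert each factor, then combine the scalars. A dependency-free sketch of that semantics (illustrative, not part of this patch):

#include <cassert>

// as():     s1  = alpha * s2
// asbs():   s1  = alpha * s2 + beta * s3
// asbs_s(): s1 += alpha * s2 + beta * s3
// with alpha/beta optionally negated (flip_sign_*) and then inverted (reciprocal_*),
// in exactly that order, as in the kernels above.
static double apply_factor(double value, double factor, bool reciprocal, bool flip_sign)
{
  if (flip_sign)  factor = -factor;
  if (reciprocal) factor = 1.0 / factor;
  return value * factor;
}

int main()
{
  double s2 = 3.0, s3 = 4.0;

  double s1 = apply_factor(s2, 2.0, false, true);        // as() with flip_sign: s1 = -2 * 3
  assert(s1 == -6.0);

  s1 = apply_factor(s2, 2.0, true, false)                // asbs() with reciprocal alpha:
     + apply_factor(s3, 5.0, false, false);              // s1 = 3/2 + 5*4 = 21.5
  assert(s1 == 21.5);
  return 0;
}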


[47/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
new file mode 100644
index 0000000..e463e88
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(A->cuda_mem);
+  h.cuda_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = A->opencl_mem;
+  h.opencl_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(A->host_mem);
+  h.ram_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  switch (A->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_matrix(h, A);
+
+    case ViennaCLOpenCL:
+      return init_opencl_matrix(h, A);
+
+    case ViennaCLHost:
+      return init_host_matrix(h, A);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
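
A hedged sketch of what init_host_matrix() above records for a double-precision host matrix (illustrative, not part of this patch). It assumes ViennaCLMatrix is a pointer typedef for the ViennaCLMatrix_impl struct from viennacl_private.hpp (shown further below) and that the snippet is compiled inside libviennacl/src next to these headers:

#include <cassert>
#include <vector>

#include "viennacl_private.hpp"   // assumed relative include inside libviennacl/src
#include "init_matrix.hpp"

int main()
{
  std::vector<double> storage(4 * 3, 1.0);

  ViennaCLMatrix_impl A;          // backend/order/trans are not read by init_host_matrix()
  A.precision = ViennaCLDouble;
  A.host_mem  = reinterpret_cast<char*>(&storage[0]);
  A.size1 = 4;  A.start1 = 0;  A.stride1 = 1;  A.internal_size1 = 4;
  A.size2 = 3;  A.start2 = 0;  A.stride2 = 1;  A.internal_size2 = 3;

  viennacl::backend::mem_handle h;
  init_host_matrix(h, &A);        // wraps 'storage' in the handle; no copy is made

  assert(h.get_active_handle_id() == viennacl::MAIN_MEMORY);
  assert(h.raw_size() == 4 * 3 * sizeof(double));
  return 0;
}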

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
new file mode 100644
index 0000000..8be00d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(x->cuda_mem);
+  h.cuda_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = x->opencl_mem;
+  h.opencl_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(x->host_mem);
+  h.ram_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  switch (x->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_vector(h, x);
+
+    case ViennaCLOpenCL:
+      return init_opencl_vector(h, x);
+
+    case ViennaCLHost:
+      return init_host_vector(h, x);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
new file mode 100644
index 0000000..c66c848
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_VIENNACL_PRIVATE_HPP
+#define VIENNACL_VIENNACL_PRIVATE_HPP
+
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+#include "viennacl.hpp"
+
+
+/************* Backend Management ******************/
+
+struct ViennaCLCUDABackend_impl
+{
+    //TODO: Add stream and/or device descriptors here
+};
+
+struct ViennaCLOpenCLBackend_impl
+{
+  ViennaCLInt context_id;
+};
+
+struct ViennaCLHostBackend_impl
+{
+  // Nothing to specify *at the moment*
+};
+
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl
+{
+  ViennaCLBackendTypes backend_type;
+
+  ViennaCLCUDABackend_impl     cuda_backend;
+  ViennaCLOpenCLBackend_impl   opencl_backend;
+  ViennaCLHostBackend_impl     host_backend;
+};
+
+
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl
+{
+  ViennaCLPrecision  precision;
+
+  union {
+    float  value_float;
+    double value_double;
+  };
+};
+
+struct ViennaCLScalar_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+};
+
+struct ViennaCLVector_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+  ViennaCLInt   inc;
+  ViennaCLInt   size;
+};
+
+struct ViennaCLMatrix_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+  ViennaCLOrder      order;
+  ViennaCLTranspose  trans;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   size1;
+  ViennaCLInt   start1;
+  ViennaCLInt   stride1;
+  ViennaCLInt   internal_size1;
+
+  ViennaCLInt   size2;
+  ViennaCLInt   start2;
+  ViennaCLInt   stride2;
+  ViennaCLInt   internal_size2;
+};
+
+
+#endif
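
ViennaCLHostScalar_impl above is effectively a tagged union: 'precision' records which union member is valid. A minimal sketch (illustrative, not part of this patch), assuming it is compiled next to viennacl_private.hpp inside libviennacl/src:

#include "viennacl_private.hpp"   // assumed relative include inside libviennacl/src

static double host_scalar_as_double(ViennaCLHostScalar_impl const & s)
{
  // 'precision' tags which member of the anonymous union is valid
  return (s.precision == ViennaCLDouble) ? s.value_double
                                         : static_cast<double>(s.value_float);
}

int main()
{
  ViennaCLHostScalar_impl s;
  s.precision    = ViennaCLDouble;
  s.value_double = 42.0;
  return (host_scalar_as_double(s) == 42.0) ? 0 : 1;
}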

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
new file mode 100644
index 0000000..ccfd035
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
@@ -0,0 +1,171 @@
+#ifndef VIENNACL_BACKEND_CPU_RAM_HPP_
+#define VIENNACL_BACKEND_CPU_RAM_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cpu_ram.hpp
+    @brief Implementations for the OpenCL backend functionality
+*/
+
+#include <cassert>
+#include <vector>
+#ifdef VIENNACL_WITH_AVX2
+#include <stdlib.h>
+#endif
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cpu_ram
+{
+typedef viennacl::tools::shared_ptr<char>  handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+//                              dest, offset, size)
+// *
+//
+
+namespace detail
+{
+  /** @brief Helper struct for deleting a pointer to an array */
+  template<class U>
+  struct array_deleter
+  {
+#ifdef VIENNACL_WITH_AVX2
+    void operator()(U* p) const { free(p); }
+#else
+    void operator()(U* p) const { delete[] p; }
+#endif
+  };
+
+}
+
+/** @brief Creates an array of the specified size in main RAM. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+#ifdef VIENNACL_WITH_AVX2
+  // Note: aligned_alloc not available on all compilers. Consider platform-specific alternatives such as posix_memalign()
+  if (!host_ptr)
+    return handle_type(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+
+  handle_type new_handle(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+#else
+  if (!host_ptr)
+    return handle_type(new char[size_in_bytes], detail::array_deleter<char>());
+
+  handle_type new_handle(new char[size_in_bytes], detail::array_deleter<char>());
+#endif
+
+  // copy data:
+  char * raw_ptr = new_handle.get();
+  const char * data_ptr = static_cast<const char *>(host_ptr);
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+  for (long i=0; i<long(size_in_bytes); ++i)
+    raw_ptr[i] = data_ptr[i];
+
+  return new_handle;
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+ *
+ *  @param src_buffer     A smart pointer to the begin of an allocated buffer
+ *  @param dst_buffer     A smart pointer to the begin of an allocated buffer
+ *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+                        handle_type & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    dst_buffer.get()[vcl_size_t(i)+dst_offset] = src_buffer.get()[vcl_size_t(i) + src_offset];
+}
+
+/** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte to be written
+ */
+inline void memory_write(handle_type & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool /*async*/)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    dst_buffer.get()[vcl_size_t(i)+dst_offset] = static_cast<const char *>(ptr)[i];
+}
+
+/** @brief Reads data from a buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data should be written
+ */
+inline void memory_read(handle_type const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool /*async*/)
+{
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    static_cast<char *>(ptr)[i] = src_buffer.get()[vcl_size_t(i)+src_offset];
+}
+
+}
+} //backend
+} //viennacl
+#endif
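
A minimal host-only round trip through the cpu_ram backend above, using only the memory_create()/memory_write()/memory_read() signatures shown in this file (illustrative, not part of this patch):

#include <cassert>
#include <vector>

#include "viennacl/backend/cpu_ram.hpp"

int main()
{
  std::vector<double> src(16, 1.5), dst(16, 0.0);
  viennacl::vcl_size_t bytes = src.size() * sizeof(double);

  // allocate a RAM buffer initialized with the contents of src
  viennacl::backend::cpu_ram::handle_type buf =
      viennacl::backend::cpu_ram::memory_create(bytes, &src[0]);

  // overwrite the first half again, then read the whole buffer back
  viennacl::backend::cpu_ram::memory_write(buf, 0, bytes / 2, &src[0], false);
  viennacl::backend::cpu_ram::memory_read (buf, 0, bytes, &dst[0], false);

  assert(dst[0] == 1.5 && dst[15] == 1.5);
  return 0;
}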

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
new file mode 100644
index 0000000..641bfea
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
@@ -0,0 +1,206 @@
+#ifndef VIENNACL_BACKEND_CUDA_HPP_
+#define VIENNACL_BACKEND_CUDA_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cuda.hpp
+    @brief Implementations for the CUDA backend functionality
+*/
+
+
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <stdexcept>
+#include <sstream>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+#define VIENNACL_CUDA_ERROR_CHECK(err)  detail::cuda_error_check (err, __FILE__, __LINE__)
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cuda
+{
+
+typedef viennacl::tools::shared_ptr<char>  handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+//                              dest, offset, size)
+// *
+//
+
+class cuda_exception : public std::runtime_error
+{
+public:
+  cuda_exception(std::string const & what_arg, cudaError_t err_code) : std::runtime_error(what_arg), error_code_(err_code) {}
+
+  cudaError_t error_code() const { return error_code_; }
+
+private:
+  cudaError_t error_code_;
+};
+
+namespace detail
+{
+
+  inline void cuda_error_check(cudaError error_code, const char *file, const int line )
+  {
+    if (cudaSuccess != error_code)
+    {
+      std::stringstream ss;
+      ss << file << "(" << line << "): CUDA Runtime API error " << error_code << ": " << cudaGetErrorString( error_code ) << std::endl;
+      throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+    }
+  }
+
+
+  /** @brief Functor for deleting a CUDA handle. Used within the smart pointer class. */
+  template<typename U>
+  struct cuda_deleter
+  {
+    void operator()(U * p) const
+    {
+      //std::cout << "Freeing handle " << reinterpret_cast<void *>(p) << std::endl;
+      cudaFree(p);
+    }
+  };
+
+}
+
+/** @brief Creates an array of the specified size on the CUDA device. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+  void * dev_ptr = NULL;
+  VIENNACL_CUDA_ERROR_CHECK( cudaMalloc(&dev_ptr, size_in_bytes) );
+  //std::cout << "Allocated new dev_ptr " << dev_ptr << " of size " <<  size_in_bytes << std::endl;
+
+  if (!host_ptr)
+    return handle_type(reinterpret_cast<char *>(dev_ptr), detail::cuda_deleter<char>());
+
+  handle_type new_handle(reinterpret_cast<char*>(dev_ptr), detail::cuda_deleter<char>());
+
+  // copy data:
+  //std::cout << "Filling new handle from host_ptr " << host_ptr << std::endl;
+  cudaMemcpy(new_handle.get(), host_ptr, size_in_bytes, cudaMemcpyHostToDevice);
+
+  return new_handle;
+}
+
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' on the CUDA device to memory starting at address 'dst_buffer + dst_offset' on the same CUDA device.
+ *
+ *  @param src_buffer     A smart pointer to the begin of an allocated CUDA buffer
+ *  @param dst_buffer     A smart pointer to the begin of an allocated CUDA buffer
+ *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+                        handle_type & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  cudaMemcpy(reinterpret_cast<void *>(dst_buffer.get() + dst_offset),
+             reinterpret_cast<void *>(src_buffer.get() + src_offset),
+             bytes_to_copy,
+             cudaMemcpyDeviceToDevice);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the CUDA buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated CUDA buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte to be written
+ * @param async              Whether the operation should be asynchronous
+ */
+inline void memory_write(handle_type & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool async = false)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  if (async)
+    cudaMemcpyAsync(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+                    reinterpret_cast<const char *>(ptr),
+                    bytes_to_copy,
+                    cudaMemcpyHostToDevice);
+  else
+    cudaMemcpy(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+               reinterpret_cast<const char *>(ptr),
+               bytes_to_copy,
+               cudaMemcpyHostToDevice);
+}
+
+
+/** @brief Reads data from a CUDA buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated CUDA source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data should be written
+ * @param async              Whether the operation should be asynchronous
+ */
+inline void memory_read(handle_type const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool async = false)
+{
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  if (async)
+    cudaMemcpyAsync(reinterpret_cast<char *>(ptr),
+                    reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+                    bytes_to_copy,
+                    cudaMemcpyDeviceToHost);
+  else
+    cudaMemcpy(reinterpret_cast<char *>(ptr),
+               reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+               bytes_to_copy,
+               cudaMemcpyDeviceToHost);
+}
+
+} //cuda
+} //backend
+} //viennacl
+#endif
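
The same round trip against the CUDA backend above (illustrative, not part of this patch). It assumes a build with VIENNACL_WITH_CUDA, the CUDA runtime on the include/link path, and an available device; memory_create() throws the cuda_exception defined above if cudaMalloc() fails:

#include <iostream>
#include <vector>

#include "viennacl/backend/cuda.hpp"

int main()
{
  std::vector<float> host_in(256, 2.0f), host_out(256, 0.0f);
  viennacl::vcl_size_t bytes = host_in.size() * sizeof(float);

  try
  {
    // allocate on the device, initialized from host_in, then copy back synchronously
    viennacl::backend::cuda::handle_type dev =
        viennacl::backend::cuda::memory_create(bytes, &host_in[0]);
    viennacl::backend::cuda::memory_read(dev, 0, bytes, &host_out[0]);
  }
  catch (viennacl::backend::cuda::cuda_exception const & e)
  {
    std::cerr << "CUDA error " << e.error_code() << ": " << e.what() << std::endl;
    return 1;
  }
  return (host_out[0] == 2.0f) ? 0 : 1;
}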

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
new file mode 100644
index 0000000..37c680b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_BACKEND_MEM_HANDLE_HPP
+#define VIENNACL_BACKEND_MEM_HANDLE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/mem_handle.hpp
+    @brief Implements the multi-memory-domain handle
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+namespace detail
+{
+  /** @brief Singleton for managing the default memory type.
+  *
+  * @param new_mem_type    If NULL, returns the current memory type. Otherwise, sets the memory type to the provided value.
+  */
+  inline memory_types get_set_default_memory_type(memory_types * new_mem_type)
+  {
+    // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+#ifdef VIENNACL_WITH_CUDA
+    static memory_types mem_type = CUDA_MEMORY;
+#elif defined(VIENNACL_WITH_OPENCL)
+    static memory_types mem_type = OPENCL_MEMORY;
+#else
+    static memory_types mem_type = MAIN_MEMORY;
+#endif
+
+    if (new_mem_type)
+      mem_type = *new_mem_type;
+
+    return mem_type;
+  }
+}
+
+/** @brief Returns the default memory type for the given configuration.
+ *
+ * CUDA has precedence over OpenCL, which has precedence over main memory. Depends on which VIENNACL_WITH_{CUDA/OPENCL/OPENMP} macros are defined.
+ */
+inline memory_types default_memory_type() { return detail::get_set_default_memory_type(NULL); }
+
+/** @brief Sets the default memory type for the given configuration.
+ *
+ * Make sure the respective new memory type is enabled.
+ * For example, passing CUDA_MEMORY if no CUDA backend is selected will result in exceptions being thrown as soon as you try to allocate buffers.
+ */
+inline memory_types default_memory_type(memory_types new_memory_type) { return detail::get_set_default_memory_type(&new_memory_type); }
+
+
+/** @brief Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
+ *
+ * The idea is to wrap all possible handle types inside this class so that higher-level code does not need to be cluttered with preprocessor switches.
+ * Instead, this class collects all the necessary conditional compilations.
+ *
+ */
+class mem_handle
+{
+public:
+  typedef viennacl::tools::shared_ptr<char>      ram_handle_type;
+  typedef viennacl::tools::shared_ptr<char>      cuda_handle_type;
+
+  /** @brief Default CTOR. No memory is allocated */
+  mem_handle() : active_handle_(MEMORY_NOT_INITIALIZED), size_in_bytes_(0) {}
+
+  /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+  ram_handle_type       & ram_handle()       { return ram_handle_; }
+  /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+  ram_handle_type const & ram_handle() const { return ram_handle_; }
+
+#ifdef VIENNACL_WITH_OPENCL
+  /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+  viennacl::ocl::handle<cl_mem>       & opencl_handle()       { return opencl_handle_; }
+  /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+  viennacl::ocl::handle<cl_mem> const & opencl_handle() const { return opencl_handle_; }
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+  cuda_handle_type       & cuda_handle()       { return cuda_handle_; }
+  /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+  cuda_handle_type const & cuda_handle() const { return cuda_handle_; }
+#endif
+
+  /** @brief Returns an ID for the currently active memory buffer. Other memory buffers might contain old or no data. */
+  memory_types get_active_handle_id() const { return active_handle_; }
+
+  /** @brief Switches the currently active handle. If no support for that backend is provided, an exception is thrown. */
+  void switch_active_handle_id(memory_types new_id)
+  {
+    if (new_id != active_handle_)
+    {
+      if (active_handle_ == MEMORY_NOT_INITIALIZED)
+        active_handle_ = new_id;
+      else if (active_handle_ == MAIN_MEMORY)
+      {
+        active_handle_ = new_id;
+      }
+      else if (active_handle_ == OPENCL_MEMORY)
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        active_handle_ = new_id;
+#else
+        throw memory_exception("compiled without OpenCL support!");
+#endif
+      }
+      else if (active_handle_ == CUDA_MEMORY)
+      {
+#ifdef VIENNACL_WITH_CUDA
+        active_handle_ = new_id;
+#else
+        throw memory_exception("compiled without CUDA support!");
+#endif
+      }
+      else
+        throw memory_exception("invalid new memory region!");
+    }
+  }
+
+  /** @brief Compares the two handles and returns true if the active memory handles in the two mem_handles point to the same buffer. */
+  bool operator==(mem_handle const & other) const
+  {
+    if (active_handle_ != other.active_handle_)
+      return false;
+
+    switch (active_handle_)
+    {
+    case MAIN_MEMORY:
+      return ram_handle_.get() == other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      return opencl_handle_.get() == other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      return cuda_handle_.get() == other.cuda_handle_.get();
+#endif
+    default: break;
+    }
+
+    return false;
+  }
+
+  /** @brief Compares the two handles and returns true if the active memory handle of this mem_handle points to a buffer with a smaller address than the one in 'other'.
+     * Useful for storing handles in a map, since this yields an ordering of the handles.
+     */
+  bool operator<(mem_handle const & other) const
+  {
+    if (active_handle_ != other.active_handle_)
+      return false;
+
+    switch (active_handle_)
+    {
+    case MAIN_MEMORY:
+      return ram_handle_.get() < other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      return opencl_handle_.get() < other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      return cuda_handle_.get() < other.cuda_handle_.get();
+#endif
+    default: break;
+    }
+
+    return false;
+  }
+
+
+  bool operator!=(mem_handle const & other) const { return !(*this == other); }
+
+  /** @brief Implements a fast swapping method. No data is copied, only the handles are exchanged. */
+  void swap(mem_handle & other)
+  {
+    // swap handle type:
+    memory_types active_handle_tmp = other.active_handle_;
+    other.active_handle_ = active_handle_;
+    active_handle_ = active_handle_tmp;
+
+    // swap ram handle:
+    ram_handle_type ram_handle_tmp = other.ram_handle_;
+    other.ram_handle_ = ram_handle_;
+    ram_handle_ = ram_handle_tmp;
+
+    // swap OpenCL handle:
+#ifdef VIENNACL_WITH_OPENCL
+    opencl_handle_.swap(other.opencl_handle_);
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    cuda_handle_type cuda_handle_tmp = other.cuda_handle_;
+    other.cuda_handle_ = cuda_handle_;
+    cuda_handle_ = cuda_handle_tmp;
+#endif
+  }
+
+  /** @brief Returns the number of bytes of the currently active buffer */
+  vcl_size_t raw_size() const               { return size_in_bytes_; }
+
+  /** @brief Sets the size of the currently active buffer. Use with care! */
+  void        raw_size(vcl_size_t new_size) { size_in_bytes_ = new_size; }
+
+private:
+  memory_types active_handle_;
+  ram_handle_type ram_handle_;
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::handle<cl_mem> opencl_handle_;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  cuda_handle_type        cuda_handle_;
+#endif
+  vcl_size_t size_in_bytes_;
+};
+
+
+} //backend
+} //viennacl
+#endif
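
A small host-only sketch of mem_handle bookkeeping (illustrative, not part of this patch). It fills the handle the same way the generic memory_create() in viennacl/backend/memory.hpp (next file) does for MAIN_MEMORY, and assumes a build without VIENNACL_WITH_OPENCL / VIENNACL_WITH_CUDA so that main memory is the default:

#include <cassert>
#include <vector>

#include "viennacl/backend/mem_handle.hpp"

int main()
{
  // in a plain host build the default memory type resolves to MAIN_MEMORY
  assert(viennacl::backend::default_memory_type() == viennacl::MAIN_MEMORY);

  std::vector<double> data(8, 3.0);
  viennacl::vcl_size_t bytes = data.size() * sizeof(double);

  viennacl::backend::mem_handle a, b;

  // mirror what the generic backend::memory_create() (next file) does for MAIN_MEMORY
  a.switch_active_handle_id(viennacl::MAIN_MEMORY);
  a.ram_handle() = viennacl::backend::cpu_ram::memory_create(bytes, &data[0]);
  a.raw_size(bytes);

  assert(a.get_active_handle_id() == viennacl::MAIN_MEMORY);
  assert(a.raw_size() == bytes);
  assert(!(a == b));            // b is still uninitialized

  a.swap(b);                    // only the handles are exchanged; no buffer is copied
  assert(b.get_active_handle_id() == viennacl::MAIN_MEMORY);
  return 0;
}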

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
new file mode 100644
index 0000000..d6f29a5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
@@ -0,0 +1,628 @@
+#ifndef VIENNACL_BACKEND_MEMORY_HPP
+#define VIENNACL_BACKEND_MEMORY_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/memory.hpp
+    @brief Main interface routines for memory management
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/backend/util.hpp"
+
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#include "viennacl/ocl/backend.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+
+  // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+  /** @brief Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed. */
+  inline void finish()
+  {
+#ifdef VIENNACL_WITH_CUDA
+    cudaDeviceSynchronize();
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::get_queue().finish();
+#endif
+  }
+
+
+
+
+  // Requirements for backend:
+
+  // ---- Memory ----
+  //
+  // * memory_create(size, host_ptr)
+  // * memory_copy(src, dest, offset_src, offset_dest, size)
+  // * memory_write(src, offset, size, ptr)
+  // * memory_read(src, offset, size, ptr)
+  //
+
+  /** @brief Creates an array of the specified size. If the second argument is provided, the buffer is initialized with data from that pointer.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Creates the memory in the currently active memory domain.
+  *
+  * @param handle          The generic wrapper handle for multiple memory domains which will hold the new buffer.
+  * @param size_in_bytes   Number of bytes to allocate
+  * @param ctx             Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+  * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+  *
+  */
+  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+  {
+    if (size_in_bytes > 0)
+    {
+      if (handle.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+        handle.switch_active_handle_id(ctx.memory_type());
+
+      switch (handle.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        handle.ram_handle() = cpu_ram::memory_create(size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        handle.opencl_handle().context(ctx.opencl_context());
+        handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        handle.cuda_handle() = cuda::memory_create(size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  /*
+  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+  {
+    viennacl::context  ctx(default_memory_type());
+    memory_create(handle, size_in_bytes, ctx, host_ptr);
+  }*/
+
+
+  /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Copies the memory in the currently active memory domain.
+  *
+  *
+  *  @param src_buffer     A smart pointer to the begin of an allocated buffer
+  *  @param dst_buffer     A smart pointer to the begin of an allocated buffer
+  *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+  *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+  *  @param bytes_to_copy  Number of bytes to be copied
+  */
+  inline void memory_copy(mem_handle const & src_buffer,
+                          mem_handle & dst_buffer,
+                          vcl_size_t src_offset,
+                          vcl_size_t dst_offset,
+                          vcl_size_t bytes_to_copy)
+  {
+    assert( src_buffer.get_active_handle_id() == dst_buffer.get_active_handle_id() && bool("memory_copy() must be called on buffers from the same domain") );
+
+    if (bytes_to_copy > 0)
+    {
+      switch (src_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_copy(src_buffer.ram_handle(), dst_buffer.ram_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_copy(src_buffer.opencl_handle(), dst_buffer.opencl_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_copy(src_buffer.cuda_handle(), dst_buffer.cuda_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  // TODO: Refine this concept. Maybe move to constructor?
+  /** @brief A 'shallow' copy operation from an initialized buffer to an uninitialized buffer.
+   * The destination buffer simply takes over the raw handle; no data is copied.
+   */
+  inline void memory_shallow_copy(mem_handle const & src_buffer,
+                                  mem_handle & dst_buffer)
+  {
+    assert( (dst_buffer.get_active_handle_id() == MEMORY_NOT_INITIALIZED) && bool("Shallow copy on already initialized memory not supported!"));
+
+    switch (src_buffer.get_active_handle_id())
+    {
+    case MAIN_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.ram_handle() = src_buffer.ram_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.opencl_handle() = src_buffer.opencl_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.cuda_handle() = src_buffer.cuda_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#endif
+    case MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("unknown memory handle!");
+    }
+  }
+
+  /** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Writes the memory in the currently active memory domain.
+  *
+  * @param dst_buffer     A smart pointer to the beginning of an allocated buffer
+  * @param dst_offset     Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+  * @param bytes_to_write Number of bytes to be written
+  * @param ptr            Pointer to the first byte to be written
+  * @param async              Whether the operation should be asynchronous
+  */
+  inline void memory_write(mem_handle & dst_buffer,
+                           vcl_size_t dst_offset,
+                           vcl_size_t bytes_to_write,
+                           const void * ptr,
+                           bool async = false)
+  {
+    if (bytes_to_write > 0)
+    {
+      switch (dst_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_write(dst_buffer.ram_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_write(dst_buffer.opencl_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_write(dst_buffer.cuda_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  /** @brief Reads data from a buffer back to main RAM.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Reads the memory from the currently active memory domain.
+  *
+  * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+  * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+  * @param bytes_to_read      Number of bytes to be read
+  * @param ptr                Location in main RAM to which the read data should be written
+  * @param async              Whether the operation should be asynchronous
+  */
+  inline void memory_read(mem_handle const & src_buffer,
+                          vcl_size_t src_offset,
+                          vcl_size_t bytes_to_read,
+                          void * ptr,
+                          bool async = false)
+  {
+    //finish(); //Fixes some issues with AMD APP SDK. However, might sacrifice a few percent of performance in some cases.
+
+    if (bytes_to_read > 0)
+    {
+      switch (src_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_read(src_buffer.ram_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_read(src_buffer.opencl_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_read(src_buffer.cuda_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+
+
+  namespace detail
+  {
+    template<typename T>
+    vcl_size_t element_size(memory_types /* mem_type */)
+    {
+      return sizeof(T);
+    }
+
+
+    template<>
+    inline vcl_size_t element_size<unsigned long>(memory_types
+                                            #ifdef VIENNACL_WITH_OPENCL
+                                                  mem_type  //in order to compile cleanly at -Wextra in GCC
+                                            #endif
+                                                  )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_ulong);
+#endif
+      return sizeof(unsigned long);
+    }
+
+    template<>
+    inline vcl_size_t element_size<long>(memory_types
+                                   #ifdef VIENNACL_WITH_OPENCL
+                                         mem_type  //in order to compile cleanly at -Wextra in GCC
+                                   #endif
+                                         )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_long);
+#endif
+      return sizeof(long);
+    }
+
+
+    template<>
+    inline vcl_size_t element_size<unsigned int>(memory_types
+                                           #ifdef VIENNACL_WITH_OPENCL
+                                                 mem_type  //in order to compile cleanly at -Wextra in GCC
+                                           #endif
+                                                 )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_uint);
+#endif
+      return sizeof(unsigned int);
+    }
+
+    template<>
+    inline vcl_size_t element_size<int>(memory_types
+                                  #ifdef VIENNACL_WITH_OPENCL
+                                        mem_type  //in order to compile cleanly at -Wextra in GCC
+                                  #endif
+                                        )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_int);
+#endif
+      return sizeof(int);
+    }
+
+
+  }
+
+
+  /** @brief Switches the active memory domain within a memory handle. Data is copied if the new active domain differs from the old one. Memory in the source handle is not freed. */
+  template<typename DataType>
+  void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
+  {
+    if (handle.get_active_handle_id() == new_ctx.memory_type())
+      return;
+
+    if (handle.get_active_handle_id() == viennacl::MEMORY_NOT_INITIALIZED || handle.raw_size() == 0)
+    {
+      handle.switch_active_handle_id(new_ctx.memory_type());
+#ifdef VIENNACL_WITH_OPENCL
+      if (new_ctx.memory_type() == OPENCL_MEMORY)
+        handle.opencl_handle().context(new_ctx.opencl_context());
+#endif
+      return;
+    }
+
+    vcl_size_t size_dst = detail::element_size<DataType>(handle.get_active_handle_id());
+    vcl_size_t size_src = detail::element_size<DataType>(new_ctx.memory_type());
+
+    if (size_dst != size_src)  // OpenCL data element size not the same as host data element size
+    {
+      throw memory_exception("Heterogeneous data element sizes not yet supported!");
+    }
+    else //no data conversion required
+    {
+      if (handle.get_active_handle_id() == MAIN_MEMORY) //we can access the existing data directly
+      {
+        switch (new_ctx.memory_type())
+        {
+#ifdef VIENNACL_WITH_OPENCL
+        case OPENCL_MEMORY:
+          handle.opencl_handle().context(new_ctx.opencl_context());
+          handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), handle.raw_size(), handle.ram_handle().get());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case CUDA_MEMORY:
+          handle.cuda_handle() = cuda::memory_create(handle.raw_size(), handle.ram_handle().get());
+          break;
+#endif
+        case MAIN_MEMORY:
+        default:
+          throw memory_exception("Invalid destination domain");
+        }
+      }
+#ifdef VIENNACL_WITH_OPENCL
+      else if (handle.get_active_handle_id() == OPENCL_MEMORY) // data can be dumped into destination directly
+      {
+        std::vector<DataType> buffer;
+
+        switch (new_ctx.memory_type())
+        {
+        case MAIN_MEMORY:
+          handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+          opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+          break;
+#ifdef VIENNACL_WITH_CUDA
+        case CUDA_MEMORY:
+          buffer.resize(handle.raw_size() / sizeof(DataType));
+          opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), &(buffer[0]));
+          cuda::memory_create(handle.cuda_handle(), handle.raw_size(), &(buffer[0]));
+          break;
+#endif
+        default:
+          throw memory_exception("Invalid destination domain");
+        }
+      }
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      else //CUDA_MEMORY
+      {
+        std::vector<DataType> buffer;
+
+        // write
+        switch (new_ctx.memory_type())
+        {
+        case MAIN_MEMORY:
+          handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+          cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case OPENCL_MEMORY:
+          buffer.resize(handle.raw_size() / sizeof(DataType));
+          cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), &(buffer[0]));
+          handle.opencl_handle() = opencl::memory_create(handle.raw_size(), &(buffer[0]));
+          break;
+#endif
+        default:
+          throw memory_exception("Unsupported source memory domain");
+        }
+      }
+#endif
+
+      // everything succeeded so far, now switch to new domain:
+      handle.switch_active_handle_id(new_ctx.memory_type());
+
+    } // no data conversion
+  }
+
+
+
+  /** @brief Copies data of the provided 'DataType' from 'handle_src' to 'handle_dst' and converts the data if the binary representation of 'DataType' among the memory domains differs. */
+  template<typename DataType>
+  void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
+  {
+    if (handle_dst.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+      handle_dst.switch_active_handle_id(default_memory_type());
+
+    vcl_size_t element_size_src = detail::element_size<DataType>(handle_src.get_active_handle_id());
+    vcl_size_t element_size_dst = detail::element_size<DataType>(handle_dst.get_active_handle_id());
+
+    if (element_size_src != element_size_dst)
+    {
+      // Data needs to be converted.
+
+      typesafe_host_array<DataType> buffer_src(handle_src);
+      typesafe_host_array<DataType> buffer_dst(handle_dst, handle_src.raw_size() / element_size_src);
+
+      //
+      // Step 1: Fill buffer_dst depending on where the data resides:
+      //
+      DataType const * src_data;
+      switch (handle_src.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        src_data = reinterpret_cast<DataType const *>(handle_src.ram_handle().get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, src_data[i]);
+        break;
+
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+        opencl::memory_read(handle_src.opencl_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, buffer_src[i]);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+        cuda::memory_read(handle_src.cuda_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, buffer_src[i]);
+        break;
+#endif
+
+      default:
+        throw memory_exception("unsupported memory domain");
+      }
+
+      //
+      // Step 2: Write to destination
+      //
+      if (handle_dst.raw_size() == buffer_dst.raw_size())
+        viennacl::backend::memory_write(handle_dst, 0, buffer_dst.raw_size(), buffer_dst.get());
+      else
+        viennacl::backend::memory_create(handle_dst, buffer_dst.raw_size(), viennacl::traits::context(handle_dst), buffer_dst.get());
+
+    }
+    else
+    {
+      // No data conversion required.
+      typesafe_host_array<DataType> buffer(handle_src);
+
+      switch (handle_src.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+        case OPENCL_MEMORY:
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() == handle_src.raw_size())
+            viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), handle_src.ram_handle().get());
+          else
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst), handle_src.ram_handle().get());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      case OPENCL_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+          break;
+
+        case OPENCL_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+          break;
+
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+          viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      case CUDA_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+          break;
+
+        case OPENCL_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+          viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+          break;
+
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      default:
+        throw memory_exception("unsupported source memory domain");
+      }
+
+    }
+  }
+
+
+} //backend
+
+//
+// Convenience layer:
+//
+/** @brief Generic convenience routine for migrating data of an object to a new memory domain */
+template<typename T>
+void switch_memory_context(T & obj, viennacl::context new_ctx)
+{
+  obj.switch_memory_context(new_ctx);
+}
+
+} //viennacl
+#endif
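
A rough usage sketch for the generic routines above (not part of the patch; the helper
name read_back and the float element type are illustrative assumptions): memory_read
dispatches on the active memory domain of a mem_handle, so reading a ViennaCL vector
back to host RAM looks the same for host, OpenCL and CUDA buffers.

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/backend/memory.hpp"

    // Sketch only: copies the payload of a ViennaCL vector back to main RAM via the
    // backend-agnostic dispatcher, independent of the active memory domain.
    std::vector<float> read_back(viennacl::vector<float> const & v)
    {
      std::vector<float> host(v.size());
      if (!host.empty())
        viennacl::backend::memory_read(v.handle(),                // source mem_handle
                                       0,                         // byte offset into the buffer
                                       sizeof(float) * v.size(),  // number of bytes to read
                                       &host[0]);                 // destination in main RAM
      return host;
    }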

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
new file mode 100644
index 0000000..a8be55a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
@@ -0,0 +1,151 @@
+#ifndef VIENNACL_BACKEND_OPENCL_HPP_
+#define VIENNACL_BACKEND_OPENCL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/opencl.hpp
+    @brief Implementations for the OpenCL backend functionality
+*/
+
+
+#include <vector>
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace opencl
+{
+
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size,
+//                              dest, offset, size)
+// *
+//
+
+/** @brief Creates an array of the specified size in the provided OpenCL context. If 'host_ptr' is not NULL, the buffer is initialized with data from that pointer.
+ *
+ * @param ctx             The OpenCL context in which the buffer is created
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline cl_mem memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+  //std::cout << "Creating buffer (" << size_in_bytes << " bytes) host buffer " << host_ptr << " in context " << &ctx << std::endl;
+  return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast<unsigned int>(size_in_bytes), const_cast<void *>(host_ptr));
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
+ *
+ *  @param src_buffer     A smart pointer to the beginning of an allocated OpenCL source buffer
+ *  @param dst_buffer     A smart pointer to the beginning of an allocated OpenCL destination buffer
+ *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                        viennacl::ocl::handle<cl_mem> & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( &src_buffer.context() == &dst_buffer.context() && bool("Transfer between memory buffers in different contexts not supported yet!"));
+
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+  cl_int err = clEnqueueCopyBuffer(memory_context.get_queue().handle().get(),
+                                   src_buffer.get(),
+                                   dst_buffer.get(),
+                                   src_offset,
+                                   dst_offset,
+                                   bytes_to_copy,
+                                   0, NULL, NULL);  //events
+  VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the OpenCL buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated OpenCL buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte to be written
+ * @param async         Whether the operation should be asynchronous
+ */
+inline void memory_write(viennacl::ocl::handle<cl_mem> & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool async = false)
+{
+
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(dst_buffer.context());
+
+#if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
+  std::cout << "Writing data (" << bytes_to_copy << " bytes, offset " << dst_offset << ") to OpenCL buffer " << dst_buffer.get() << " with queue " << memory_context.get_queue().handle().get() << " from " << ptr << std::endl;
+#endif
+
+  cl_int err = clEnqueueWriteBuffer(memory_context.get_queue().handle().get(),
+                                    dst_buffer.get(),
+                                    async ? CL_FALSE : CL_TRUE,             //blocking
+                                    dst_offset,
+                                    bytes_to_copy,
+                                    ptr,
+                                    0, NULL, NULL);      //events
+  VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Reads data from an OpenCL buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated OpenCL source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data should be written
+ * @param async              Whether the operation should be asynchronous
+ */
+inline void memory_read(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool async = false)
+{
+  //std::cout << "Reading data (" << bytes_to_copy << " bytes, offset " << src_offset << ") from OpenCL buffer " << src_buffer.get() << " to " << ptr << std::endl;
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+  cl_int err =  clEnqueueReadBuffer(memory_context.get_queue().handle().get(),
+                                    src_buffer.get(),
+                                    async ? CL_FALSE : CL_TRUE,             //blocking
+                                    src_offset,
+                                    bytes_to_copy,
+                                    ptr,
+                                    0, NULL, NULL);      //events
+  VIENNACL_ERR_CHECK(err);
+}
+
+
+}
+} //backend
+} //viennacl
+#endif
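
A minimal sketch of driving these raw OpenCL routines directly (assumes a build with
VIENNACL_WITH_OPENCL and a vector whose active memory domain is OpenCL; the helper name
read_opencl_payload is an illustrative assumption, not part of the patch):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/backend/opencl.hpp"

    // Sketch only: bypasses the generic viennacl::backend dispatcher and reads the
    // vector's cl_mem buffer through the OpenCL-specific memory_read above (blocking).
    void read_opencl_payload(viennacl::vector<float> const & v, std::vector<float> & host)
    {
      if (v.size() == 0)
        return;
      host.resize(v.size());
      viennacl::backend::opencl::memory_read(v.handle().opencl_handle(), // handle<cl_mem>
                                             0,                          // byte offset
                                             sizeof(float) * v.size(),   // bytes to read
                                             &host[0]);                  // destination in RAM
    }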

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
new file mode 100644
index 0000000..9aaeb2e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
@@ -0,0 +1,268 @@
+#ifndef VIENNACL_BACKEND_UTIL_HPP
+#define VIENNACL_BACKEND_UTIL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/util.hpp
+    @brief Helper functionality for working with different memory domains
+*/
+
+#include <vector>
+#include <cassert>
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+namespace detail
+{
+
+  /** @brief Helper struct for converting a type to its OpenCL pendant. */
+  template<typename T>
+  struct convert_to_opencl
+  {
+    typedef T    type;
+    enum { special = 0 };
+  };
+
+#ifdef VIENNACL_WITH_OPENCL
+  template<>
+  struct convert_to_opencl<unsigned int>
+  {
+    typedef cl_uint    type;
+    //enum { special = (sizeof(unsigned int) != sizeof(cl_uint)) };
+    enum { special = 1 };
+  };
+
+  template<>
+  struct convert_to_opencl<int>
+  {
+    typedef cl_int    type;
+    //enum { special = (sizeof(int) != sizeof(cl_int)) };
+    enum { special = 1 };
+  };
+
+
+  template<>
+  struct convert_to_opencl<unsigned long>
+  {
+    typedef cl_ulong    type;
+    //enum { special = (sizeof(unsigned long) != sizeof(cl_ulong)) };
+    enum { special = 1 };
+  };
+
+  template<>
+  struct convert_to_opencl<long>
+  {
+    typedef cl_long    type;
+    //enum { special = (sizeof(long) != sizeof(cl_long)) };
+    enum { special = 1 };
+  };
+#endif
+
+
+} //namespace detail
+
+
+/** @brief Helper class implementing an array on the host. Default case: No conversion necessary */
+template<typename T, bool special = detail::convert_to_opencl<T>::special>
+class typesafe_host_array
+{
+  typedef T                                              cpu_type;
+  typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+public:
+  explicit typesafe_host_array() : bytes_buffer_(NULL), buffer_size_(0) {}
+
+  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+  {
+    resize(handle, num);
+  }
+
+  ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+  //
+  // Setter and Getter
+  //
+  void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+  vcl_size_t raw_size() const { return buffer_size_; }
+  vcl_size_t element_size() const  {  return sizeof(cpu_type); }
+  vcl_size_t size() const { return buffer_size_ / element_size(); }
+  template<typename U>
+  void set(vcl_size_t index, U value)
+  {
+    reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+  }
+
+  //
+  // Resize functionality
+  //
+
+  /** @brief Resize without initializing the new memory */
+  void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
+  {
+    buffer_size_ = sizeof(cpu_type) * num;
+
+    if (num > 0)
+    {
+      delete[] bytes_buffer_;
+
+      bytes_buffer_ = new char[buffer_size_];
+    }
+  }
+
+  /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+  void resize(mem_handle const & handle, vcl_size_t num)
+  {
+    raw_resize(handle, num);
+
+    if (num > 0)
+    {
+      for (vcl_size_t i=0; i<buffer_size_; ++i)
+        bytes_buffer_[i] = 0;
+    }
+  }
+
+  cpu_type operator[](vcl_size_t index) const
+  {
+    assert(index < size() && bool("index out of bounds"));
+
+    return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+  }
+
+private:
+  char * bytes_buffer_;
+  vcl_size_t buffer_size_;
+};
+
+
+
+
+/** @brief Special host array type for conversion between OpenCL types and pure CPU types */
+template<typename T>
+class typesafe_host_array<T, true>
+{
+  typedef T                                              cpu_type;
+  typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+public:
+  explicit typesafe_host_array() : convert_to_opencl_( (default_memory_type() == OPENCL_MEMORY) ? true : false), bytes_buffer_(NULL), buffer_size_(0) {}
+
+  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+  {
+    resize(handle, num);
+  }
+
+  ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+  //
+  // Setter and Getter
+  //
+
+  template<typename U>
+  void set(vcl_size_t index, U value)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      reinterpret_cast<target_type *>(bytes_buffer_)[index] = static_cast<target_type>(value);
+    else
+#endif
+      reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+  }
+
+  void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+  cpu_type operator[](vcl_size_t index) const
+  {
+    assert(index < size() && bool("index out of bounds"));
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      return static_cast<cpu_type>(reinterpret_cast<target_type *>(bytes_buffer_)[index]);
+#endif
+    return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+  }
+
+  vcl_size_t raw_size() const { return buffer_size_; }
+  vcl_size_t element_size() const
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      return sizeof(target_type);
+#endif
+    return sizeof(cpu_type);
+  }
+  vcl_size_t size() const { return buffer_size_ / element_size(); }
+
+  //
+  // Resize functionality
+  //
+
+  /** @brief Resize without initializing the new memory */
+  void raw_resize(mem_handle const & handle, vcl_size_t num)
+  {
+    buffer_size_ = sizeof(cpu_type) * num;
+    (void)handle; //silence unused variable warning if compiled without OpenCL support
+
+#ifdef VIENNACL_WITH_OPENCL
+    memory_types mem_type = handle.get_active_handle_id();
+    if (mem_type == MEMORY_NOT_INITIALIZED)
+      mem_type = default_memory_type();
+
+    if (mem_type == OPENCL_MEMORY)
+    {
+      convert_to_opencl_ = true;
+      buffer_size_ = sizeof(target_type) * num;
+    }
+#endif
+
+    if (num > 0)
+    {
+      delete[] bytes_buffer_;
+
+      bytes_buffer_ = new char[buffer_size_];
+    }
+  }
+
+  /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+  void resize(mem_handle const & handle, vcl_size_t num)
+  {
+    raw_resize(handle, num);
+
+    if (num > 0)
+    {
+      for (vcl_size_t i=0; i<buffer_size_; ++i)
+        bytes_buffer_[i] = 0;
+    }
+  }
+
+private:
+  bool convert_to_opencl_;
+  char * bytes_buffer_;
+  vcl_size_t buffer_size_;
+};
+
+} //backend
+} //viennacl
+#endif
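
A rough sketch of how typesafe_host_array is meant to be used (the helper name
write_row_indices is an illustrative assumption, not part of the patch): index data is
staged in the element width of the target memory domain (cl_uint on OpenCL, plain
unsigned int on the host) before being pushed into a mem_handle.

    #include <vector>
    #include "viennacl/backend/util.hpp"
    #include "viennacl/backend/memory.hpp"
    #include "viennacl/traits/context.hpp"

    // Sketch only: stages 'rows' in the element size expected by the memory domain of
    // 'handle', then (re)creates the buffer from the staged bytes.
    void write_row_indices(viennacl::backend::mem_handle & handle,
                           std::vector<unsigned int> const & rows)
    {
      viennacl::backend::typesafe_host_array<unsigned int> staging(handle, rows.size());
      for (viennacl::vcl_size_t i = 0; i < rows.size(); ++i)
        staging.set(i, rows[i]);                            // converts to cl_uint if needed
      viennacl::backend::memory_create(handle, staging.raw_size(),
                                       viennacl::traits::context(handle), staging.get());
    }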

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
new file mode 100644
index 0000000..1ee13d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
@@ -0,0 +1,359 @@
+#ifndef VIENNACL_CIRCULANT_MATRIX_HPP
+#define VIENNACL_CIRCULANT_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/circulant_matrix.hpp
+    @brief Implementation of the circulant_matrix class for efficient manipulation of circulant matrices.  Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/linalg/circulant_matrix_operations.hpp"
+
+#include "viennacl/fft.hpp"
+
+namespace viennacl
+{
+/** @brief A Circulant matrix class
+  *
+  * @tparam NumericT  The underlying scalar type (either float or double)
+  * @tparam AlignmentV   The internal memory size is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+  */
+template<class NumericT, unsigned int AlignmentV>
+class circulant_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+
+  /**
+    * @brief The default constructor. Does not allocate any memory.
+    *
+    */
+  explicit circulant_matrix() {}
+
+  /**
+    * @brief         Creates the matrix with the given size
+    *
+    * @param rows      Number of rows of the matrix
+    * @param cols      Number of columns of the matrix
+    */
+  explicit circulant_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows)
+  {
+    assert(rows == cols && bool("Circulant matrix must be square!"));
+    (void)cols;  // avoid 'unused parameter' warning in optimized builds
+  }
+
+  /** @brief Resizes the matrix.
+   *   Existing entries can be preserved
+   *
+   * @param sz         New size of matrix
+   * @param preserve   If true, existing values are preserved.
+   */
+  void resize(vcl_size_t sz, bool preserve = true)
+  {
+    elements_.resize(sz, preserve);
+  }
+
+  /** @brief Returns the OpenCL handle
+   *
+   *   @return OpenCL handle
+   */
+  handle_type const & handle() const { return elements_.handle(); }
+
+  /**
+    * @brief Returns the internal viennacl::vector, which holds the elements of the circulant matrix
+    *
+    */
+  viennacl::vector<NumericT, AlignmentV> & elements() { return elements_; }
+  viennacl::vector<NumericT, AlignmentV> const & elements() const { return elements_; }
+
+  /**
+    * @brief Returns the number of rows of the matrix
+    */
+  vcl_size_t size1() const { return elements_.size(); }
+
+  /**
+    * @brief Returns the number of columns of the matrix
+    */
+  vcl_size_t size2() const { return elements_.size(); }
+
+  /** @brief Returns the internal size of the matrix representation.
+   *   Usually required for launching OpenCL kernels only
+   *
+   *   @return Internal size of matrix representation
+   */
+  vcl_size_t internal_size() const { return elements_.internal_size(); }
+
+  /**
+    * @brief Read-write access to a single element of the matrix
+    *
+    * @param row_index  Row index of accessed element
+    * @param col_index  Column index of accessed element
+    * @return Proxy for matrix entry
+    */
+  entry_proxy<NumericT> operator()(vcl_size_t row_index, vcl_size_t col_index)
+  {
+    long index = static_cast<long>(row_index) - static_cast<long>(col_index);
+
+    assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+    while (index < 0)
+      index += static_cast<long>(size1());
+    return elements_[static_cast<vcl_size_t>(index)];
+  }
+
+  /**
+    * @brief += operation for circulant matrices
+    *
+    * @param that Matrix which will be added
+    * @return Result of addition
+    */
+  circulant_matrix<NumericT, AlignmentV>& operator +=(circulant_matrix<NumericT, AlignmentV>& that)
+  {
+    elements_ += that.elements();
+    return *this;
+  }
+
+private:
+  circulant_matrix(circulant_matrix const &) {}
+  circulant_matrix & operator=(circulant_matrix const & t);
+
+  viennacl::vector<NumericT, AlignmentV> elements_;
+};
+
+/** @brief Copies a circulant matrix from a std::vector on the host to the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param cpu_vec   A std::vector on the host.
+  * @param gpu_mat   A circulant_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(std::vector<NumericT>& cpu_vec, circulant_matrix<NumericT, AlignmentV>& gpu_mat)
+{
+  assert( (gpu_mat.size1() == 0 || cpu_vec.size() == gpu_mat.size1()) && bool("Size mismatch"));
+  copy(cpu_vec, gpu_mat.elements());
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to the std::vector
+  *
+  *
+  * @param gpu_mat   A circulant_matrix from ViennaCL
+  * @param cpu_vec   A std::vector on the host.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(circulant_matrix<NumericT, AlignmentV>& gpu_mat, std::vector<NumericT>& cpu_vec)
+{
+  assert(cpu_vec.size() == gpu_mat.size1() && bool("Size mismatch"));
+  copy(gpu_mat.elements(), cpu_vec);
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to a matrix-like object on the host
+  *
+  *
+  * @param circ_src   A circulant_matrix from ViennaCL
+  * @param com_dst   A matrix-like object
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(circulant_matrix<NumericT, AlignmentV>& circ_src, MatrixT& com_dst)
+{
+  vcl_size_t size = circ_src.size1();
+  assert(size == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+  assert(size == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
+  std::vector<NumericT> tmp(size);
+  copy(circ_src, tmp);
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    for (vcl_size_t j = 0; j < size; j++)
+    {
+      long index = static_cast<long>(i) - static_cast<long>(j);
+      if (index < 0)
+        index += static_cast<long>(size);
+      com_dst(i, j) = tmp[static_cast<vcl_size_t>(index)];
+    }
+  }
+}
+
+/** @brief Copies a matrix-like object on the host to a circulant_matrix on the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param com_src    A matrix-like object on the host
+  * @param circ_dst   A circulant_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(MatrixT& com_src, circulant_matrix<NumericT, AlignmentV>& circ_dst)
+{
+  assert( (circ_dst.size1() == 0 || circ_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+  assert( (circ_dst.size2() == 0 || circ_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+  vcl_size_t size = viennacl::traits::size1(com_src);
+
+  std::vector<NumericT> tmp(size);
+
+  for (vcl_size_t i = 0; i < size; i++) tmp[i] = com_src(i, 0);
+
+  copy(tmp, circ_dst);
+}
+
+/*namespace linalg
+  {
+    template<typename NumericT, unsigned int AlignmentV, unsigned int VECTOR_AlignmentV>
+    void prod_impl(circulant_matrix<NumericT, AlignmentV> const & mat,
+                    vector<NumericT, VECTOR_AlignmentV> const & vec,
+                    vector<NumericT, VECTOR_AlignmentV>& result) {
+        viennacl::vector<NumericT, VECTOR_AlignmentV> circ(mat.elements().size() * 2);
+        fft::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+        viennacl::vector<NumericT, VECTOR_AlignmentV> tmp(vec.size() * 2);
+        viennacl::vector<NumericT, VECTOR_AlignmentV> tmp2(vec.size() * 2);
+
+        fft::real_to_complex(vec, tmp, vec.size());
+        fft::convolve(circ, tmp, tmp2);
+        fft::complex_to_real(tmp2, result, vec.size());
+    }
+  }*/
+
+/** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
+  *
+  * @param s            STL output stream
+  * @param gpu_matrix   A ViennaCL circulant matrix
+  */
+template<class NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream& s, circulant_matrix<NumericT, AlignmentV>& gpu_matrix)
+{
+  vcl_size_t size = gpu_matrix.size1();
+  std::vector<NumericT> tmp(size);
+  copy(gpu_matrix, tmp);
+  s << "[" << size << "," << size << "](";
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    s << "(";
+    for (vcl_size_t j = 0; j < size; j++)
+    {
+      long index = static_cast<long>(i) - static_cast<long>(j);
+      if (index < 0) index = static_cast<long>(size) + index;
+      s << tmp[vcl_size_t(index)];
+      //s << index;
+      if (j < (size - 1)) s << ",";
+    }
+    s << ")";
+  }
+  s << ")";
+  return s;
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs += temp;
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs -= temp;
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif // VIENNACL_CIRCULANT_MATRIX_HPP
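
A rough usage sketch for the class above (assumes an OpenCL-enabled build, since the
circulant product is computed via FFT kernels; sizes, values and the helper name are
illustrative, not part of the patch):

    #include <vector>
    #include "viennacl/circulant_matrix.hpp"
    #include "viennacl/vector.hpp"

    // Sketch only: a 4x4 circulant matrix is fully described by its first column.
    void circulant_example()
    {
      std::vector<float> first_column(4);
      first_column[0] = 1.0f; first_column[1] = 2.0f;
      first_column[2] = 3.0f; first_column[3] = 4.0f;

      viennacl::circulant_matrix<float, 1> A(4, 4);  // alignment 1 chosen explicitly
      viennacl::copy(first_column, A);               // upload the defining column

      std::vector<float> ones(4, 1.0f);
      viennacl::vector<float> x(4), y(4);
      viennacl::copy(ones, x);

      viennacl::linalg::prod_impl(A, x, y);          // y = A * x, as used by the op_executor above
    }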


[11/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..08e15a5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
@@ -0,0 +1,110 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
+ *  @brief OpenCL kernel file for compressed_compressed_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_vec_mul(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void vec_mul( \n");
+  source.append("  __global const unsigned int * row_jumper, \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  uint nonzero_rows, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < nonzero_rows; i += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("    unsigned int row_end = row_jumper[i+1]; \n");
+  source.append("    for (unsigned int j = row_jumper[i]; j < row_end; ++j) \n");
+  source.append("      dot_prod += elements[j] * x[column_indices[j] * layout_x.y + layout_x.x]; \n");
+
+  source.append("    if (beta != 0) result[row_indices[i] * layout_result.y + layout_result.x] += alpha * dot_prod; \n");
+  source.append("    else           result[row_indices[i] * layout_result.y + layout_result.x]  = alpha * dot_prod; \n");
+  source.append("  } \n");
+  source.append(" } \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+/** @brief Main kernel class for generating OpenCL kernels for compressed_compressed_matrix. */
+template<typename NumericT>
+struct compressed_compressed_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_compressed_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parametrized kernels:
+      generate_vec_mul(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
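
A minimal sketch of how this kernel class is typically exercised (assumes an active
OpenCL context; the helper name build_ccm_kernels is an illustrative assumption, not
part of the patch):

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp"

    // Sketch only: compiles the generated program for float once per context and
    // fetches the "vec_mul" kernel emitted by generate_vec_mul() above.
    void build_ccm_kernels()
    {
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();

      typedef viennacl::linalg::opencl::kernels::compressed_compressed_matrix<float> KernelClass;
      KernelClass::init(ctx);   // no-op on the second and later calls for this context

      viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "vec_mul");
      (void)k;                  // the kernel would now be launched via viennacl::ocl::enqueue(...)
    }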


[18/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..9cc5d67
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
@@ -0,0 +1,2081 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include "viennacl/linalg/host_based/spgemm_vector.hpp"
+
+#include <vector>
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    NumericT         * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+    NumericT   const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
+    unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+    unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+    for (vcl_size_t row = 0; row < mat.size1(); ++row)
+    {
+      NumericT value = 0;
+      unsigned int row_end = row_buffer[row+1];
+
+      switch (info_selector)
+      {
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+            value = std::max<NumericT>(value, std::fabs(elements[i]));
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+            value += std::fabs(elements[i]);
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+            value += elements[i] * elements[i];
+          value = std::sqrt(value);
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+          {
+            if (col_buffer[i] == row)
+            {
+              value = elements[i];
+              break;
+            }
+          }
+          break;
+      }
+      result_buf[row] = value;
+    }
+  }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor for the existing content of 'result' (a zero beta overwrites 'result')
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+               viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
+  {
+    NumericT dot_prod = 0;
+    vcl_size_t row_end = row_buffer[row+1];
+    for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
+      dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];
+
+    if (beta < 0 || beta > 0)
+    {
+      vcl_size_t index = static_cast<vcl_size_t>(row) * result.stride() + result.start();
+      result_buf[index] = alpha * dot_prod + beta * result_buf[index];
+    }
+    else
+      result_buf[static_cast<vcl_size_t>(row) * result.stride() + result.start()] = alpha * dot_prod;
+  }
+
+}
+
+/** @brief Carries out sparse_matrix-matrix multiplication, where the first (sparse) factor is a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat     The sparse matrix
+* @param d_mat      The dense matrix
+* @param result     The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result) {
+
+  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+      vcl_size_t row_start = sp_mat_row_buffer[row];
+      vcl_size_t row_end = sp_mat_row_buffer[row+1];
+      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+        NumericT temp = 0;
+        for (vcl_size_t k = row_start; k < row_end; ++k) {
+          temp += sp_mat_elements[k] * d_mat_wrapper_row(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), col);
+        }
+        if (result.row_major())
+          result_wrapper_row(row, col) = temp;
+        else
+          result_wrapper_col(row, col) = temp;
+      }
+    }
+  }
+  else {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+      for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+        vcl_size_t row_start = sp_mat_row_buffer[row];
+        vcl_size_t row_end = sp_mat_row_buffer[row+1];
+        NumericT temp = 0;
+        for (vcl_size_t k = row_start; k < row_end; ++k) {
+          temp += sp_mat_elements[k] * d_mat_wrapper_col(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), static_cast<vcl_size_t>(col));
+        }
+        if (result.row_major())
+          result_wrapper_row(row, col) = temp;
+        else
+          result_wrapper_col(row, col) = temp;
+      }
+    }
+  }
+
+}
+
+/** @brief Carries out matrix-trans(matrix) multiplication, where the first matrix is a compressed_matrix
+*          and the second (dense) matrix is transposed
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat             The sparse matrix
+* @param d_mat              The transposed dense matrix
+* @param result             The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+                viennacl::matrix_base<NumericT> & result) {
+
+  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+  NumericT const *  d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.lhs().row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+      vcl_size_t row_start = sp_mat_row_buffer[row];
+      vcl_size_t row_end = sp_mat_row_buffer[row+1];
+      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+        NumericT temp = 0;
+        for (vcl_size_t k = row_start; k < row_end; ++k) {
+          temp += sp_mat_elements[k] * d_mat_wrapper_row(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
+        }
+        if (result.row_major())
+          result_wrapper_row(row, col) = temp;
+        else
+          result_wrapper_col(row, col) = temp;
+      }
+    }
+  }
+  else {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+      for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+        vcl_size_t row_start = sp_mat_row_buffer[row];
+        vcl_size_t row_end = sp_mat_row_buffer[row+1];
+        NumericT temp = 0;
+        for (vcl_size_t k = row_start; k < row_end; ++k) {
+          temp += sp_mat_elements[k] * d_mat_wrapper_col(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
+        }
+        if (result.row_major())
+          result_wrapper_row(row, col) = temp;
+        else
+          result_wrapper_col(row, col) = temp;
+      }
+    }
+  }
+
+}
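+
+// Minimal usage sketch for the overload above: the dense factor enters the
+// product through the op_trans proxy returned by viennacl::trans() (same
+// includes as before, sizes illustrative):
+//
+//   viennacl::compressed_matrix<double> A(m, k);
+//   viennacl::matrix<double>            B(n, k), C(m, n);
+//   C = viennacl::linalg::prod(A, viennacl::trans(B));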
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A     Left factor
+* @param B     Right factor
+* @param C     Result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+
+  NumericT     const * A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  NumericT     const * B_elements   = detail::extract_raw_pointer<NumericT>(B.handle());
+  unsigned int const * B_row_buffer = detail::extract_raw_pointer<unsigned int>(B.handle1());
+  unsigned int const * B_col_buffer = detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+  C.resize(A.size1(), B.size2(), false);
+  unsigned int * C_row_buffer = detail::extract_raw_pointer<unsigned int>(C.handle1());
+
+#if defined(VIENNACL_WITH_OPENMP)
+  unsigned int block_factor = 10;
+  unsigned int max_threads = omp_get_max_threads();
+  long chunk_size = long(A.size1()) / long(block_factor * max_threads) + 1;
+#else
+  unsigned int max_threads = 1;
+#endif
+  std::vector<unsigned int> max_length_row_C(max_threads);
+  std::vector<unsigned int *> row_C_temp_index_buffers(max_threads);
+  std::vector<NumericT *>     row_C_temp_value_buffers(max_threads);
+
+
+  /*
+   * Stage 1: Determine maximum length of work buffers:
+   */
+
+#if defined(VIENNACL_WITH_OPENMP)
+  #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+  for (long i=0; i<long(A.size1()); ++i)
+  {
+    unsigned int row_start_A = A_row_buffer[i];
+    unsigned int row_end_A   = A_row_buffer[i+1];
+
+    unsigned int row_C_upper_bound_row = 0;
+    for (unsigned int j = row_start_A; j<row_end_A; ++j)
+    {
+      unsigned int row_B = A_col_buffer[j];
+
+      unsigned int entries_in_row = B_row_buffer[row_B+1] - B_row_buffer[row_B];
+      row_C_upper_bound_row += entries_in_row;
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+    unsigned int thread_id = omp_get_thread_num();
+#else
+    unsigned int thread_id = 0;
+#endif
+
+    max_length_row_C[thread_id] = std::max(max_length_row_C[thread_id], std::min(row_C_upper_bound_row, static_cast<unsigned int>(B.size2())));
+  }
+
+  // determine global maximum row length
+  for (std::size_t i=1; i<max_length_row_C.size(); ++i)
+    max_length_row_C[0] = std::max(max_length_row_C[0], max_length_row_C[i]);
+
+  // allocate work vectors:
+  for (unsigned int i=0; i<max_threads; ++i)
+    row_C_temp_index_buffers[i] = (unsigned int *)malloc(sizeof(unsigned int)*3*max_length_row_C[0]);
+
+
+  /*
+   * Stage 2: Determine sparsity pattern of C
+   */
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+  for (long i=0; i<long(A.size1()); ++i)
+  {
+    unsigned int thread_id = 0;
+  #ifdef VIENNACL_WITH_OPENMP
+    thread_id = omp_get_thread_num();
+  #endif
+    unsigned int buffer_len = max_length_row_C[0];
+
+    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
+    unsigned int *row_C_vector_2 = row_C_vector_1 + buffer_len;
+    unsigned int *row_C_vector_3 = row_C_vector_2 + buffer_len;
+
+    unsigned int row_start_A = A_row_buffer[i];
+    unsigned int row_end_A   = A_row_buffer[i+1];
+
+    C_row_buffer[i] = row_C_scan_symbolic_vector(row_start_A, row_end_A, A_col_buffer,
+                                                 B_row_buffer, B_col_buffer, static_cast<unsigned int>(B.size2()),
+                                                 row_C_vector_1, row_C_vector_2, row_C_vector_3);
+  }
+
+  // exclusive scan to obtain row start indices:
+  unsigned int current_offset = 0;
+  for (std::size_t i=0; i<C.size1(); ++i)
+  {
+    unsigned int tmp = C_row_buffer[i];
+    C_row_buffer[i] = current_offset;
+    current_offset += tmp;
+  }
+  C_row_buffer[C.size1()] = current_offset;
+  C.reserve(current_offset, false);
+
+  // allocate work vectors:
+  for (unsigned int i=0; i<max_threads; ++i)
+    row_C_temp_value_buffers[i] = (NumericT *)malloc(sizeof(NumericT)*3*max_length_row_C[0]);
+
+  /*
+   * Stage 3: Compute product (code similar, maybe pull out into a separate function to avoid code duplication?)
+   */
+  NumericT     * C_elements   = detail::extract_raw_pointer<NumericT>(C.handle());
+  unsigned int * C_col_buffer = detail::extract_raw_pointer<unsigned int>(C.handle2());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+  for (long i = 0; i < long(A.size1()); ++i)
+  {
+    unsigned int row_start_A  = A_row_buffer[i];
+    unsigned int row_end_A    = A_row_buffer[i+1];
+
+    unsigned int row_C_buffer_start = C_row_buffer[i];
+    unsigned int row_C_buffer_end   = C_row_buffer[i+1];
+
+#ifdef VIENNACL_WITH_OPENMP
+    unsigned int thread_id = omp_get_thread_num();
+#else
+    unsigned int thread_id = 0;
+#endif
+
+    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
+    unsigned int *row_C_vector_2 = row_C_vector_1 + max_length_row_C[0];
+    unsigned int *row_C_vector_3 = row_C_vector_2 + max_length_row_C[0];
+
+    NumericT *row_C_vector_1_values = row_C_temp_value_buffers[thread_id];
+    NumericT *row_C_vector_2_values = row_C_vector_1_values + max_length_row_C[0];
+    NumericT *row_C_vector_3_values = row_C_vector_2_values + max_length_row_C[0];
+
+    row_C_scan_numeric_vector(row_start_A, row_end_A, A_col_buffer, A_elements,
+                              B_row_buffer, B_col_buffer, B_elements, static_cast<unsigned int>(B.size2()),
+                              row_C_buffer_start, row_C_buffer_end, C_col_buffer, C_elements,
+                              row_C_vector_1, row_C_vector_1_values,
+                              row_C_vector_2, row_C_vector_2_values,
+                              row_C_vector_3, row_C_vector_3_values);
+  }
+
+  // clean up at the end:
+  for (unsigned int i=0; i<max_threads; ++i)
+  {
+    free(row_C_temp_index_buffers[i]);
+    free(row_C_temp_value_buffers[i]);
+  }
+
+}
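+
+// Minimal usage sketch for the CSR-CSR product above; C is resized inside
+// prod_impl(), so only A and B need to be set up (sizes illustrative):
+//
+//   viennacl::compressed_matrix<double> A(m, k), B(k, n), C;
+//   C = viennacl::linalg::prod(A, B);   // symbolic pass, exclusive scan over the
+//                                       // row buffer, then the numeric pass above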
+
+
+
+
+//
+// Triangular solve for compressed_matrix, A \ b
+//
+namespace detail
+{
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_inplace_solve(IndexArrayT const & row_buffer,
+                         IndexArrayT const & col_buffer,
+                         ConstScalarArrayT const & element_buffer,
+                         ScalarArrayT & vec_buffer,
+                         vcl_size_t num_cols,
+                         viennacl::linalg::unit_lower_tag)
+  {
+    vcl_size_t row_begin = row_buffer[1];
+    for (vcl_size_t row = 1; row < num_cols; ++row)
+    {
+      NumericT vec_entry = vec_buffer[row];
+      vcl_size_t row_end = row_buffer[row+1];
+      for (vcl_size_t i = row_begin; i < row_end; ++i)
+      {
+        vcl_size_t col_index = col_buffer[i];
+        if (col_index < row)
+          vec_entry -= vec_buffer[col_index] * element_buffer[i];
+      }
+      vec_buffer[row] = vec_entry;
+      row_begin = row_end;
+    }
+  }
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_inplace_solve(IndexArrayT const & row_buffer,
+                         IndexArrayT const & col_buffer,
+                         ConstScalarArrayT const & element_buffer,
+                         ScalarArrayT & vec_buffer,
+                         vcl_size_t num_cols,
+                         viennacl::linalg::lower_tag)
+  {
+    vcl_size_t row_begin = row_buffer[0];
+    for (vcl_size_t row = 0; row < num_cols; ++row)
+    {
+      NumericT vec_entry = vec_buffer[row];
+
+      // substitute and remember diagonal entry
+      vcl_size_t row_end = row_buffer[row+1];
+      NumericT diagonal_entry = 0;
+      for (vcl_size_t i = row_begin; i < row_end; ++i)
+      {
+        vcl_size_t col_index = col_buffer[i];
+        if (col_index < row)
+          vec_entry -= vec_buffer[col_index] * element_buffer[i];
+        else if (col_index == row)
+          diagonal_entry = element_buffer[i];
+      }
+
+      vec_buffer[row] = vec_entry / diagonal_entry;
+      row_begin = row_end;
+    }
+  }
+
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_inplace_solve(IndexArrayT const & row_buffer,
+                         IndexArrayT const & col_buffer,
+                         ConstScalarArrayT const & element_buffer,
+                         ScalarArrayT & vec_buffer,
+                         vcl_size_t num_cols,
+                         viennacl::linalg::unit_upper_tag)
+  {
+    for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
+    {
+      vcl_size_t row = (num_cols - row2) - 1;
+      NumericT vec_entry = vec_buffer[row];
+      vcl_size_t row_begin = row_buffer[row];
+      vcl_size_t row_end   = row_buffer[row+1];
+      for (vcl_size_t i = row_begin; i < row_end; ++i)
+      {
+        vcl_size_t col_index = col_buffer[i];
+        if (col_index > row)
+          vec_entry -= vec_buffer[col_index] * element_buffer[i];
+      }
+      vec_buffer[row] = vec_entry;
+    }
+  }
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_inplace_solve(IndexArrayT const & row_buffer,
+                         IndexArrayT const & col_buffer,
+                         ConstScalarArrayT const & element_buffer,
+                         ScalarArrayT & vec_buffer,
+                         vcl_size_t num_cols,
+                         viennacl::linalg::upper_tag)
+  {
+    for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
+    {
+      vcl_size_t row = (num_cols - row2) - 1;
+      NumericT vec_entry = vec_buffer[row];
+
+      // substitute and remember diagonal entry
+      vcl_size_t row_begin = row_buffer[row];
+      vcl_size_t row_end   = row_buffer[row+1];
+      NumericT diagonal_entry = 0;
+      for (vcl_size_t i = row_begin; i < row_end; ++i)
+      {
+        vcl_size_t col_index = col_buffer[i];
+        if (col_index > row)
+          vec_entry -= vec_buffer[col_index] * element_buffer[i];
+        else if (col_index == row)
+          diagonal_entry = element_buffer[i];
+      }
+
+      vec_buffer[row] = vec_entry / diagonal_entry;
+    }
+  }
+
+} //namespace detail
+
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param L    The matrix
+* @param vec  The vector holding the right hand side. Is overwritten by the solution.
+* @param tag  The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::unit_lower_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param L    The matrix
+* @param vec  The vector holding the right hand side. Is overwritten by the solution.
+* @param tag  The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::lower_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param U    The matrix
+* @param vec  The vector holding the right hand side. Is overwritten by the solution.
+* @param tag  The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::unit_upper_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param U    The matrix
+* @param vec  The vector holding the right hand side. Is overwritten by the solution.
+* @param tag  The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::upper_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
+}
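+
+// Minimal usage sketch for the four overloads above; L is assumed to hold a
+// lower triangular factor in CSR format, rhs is overwritten with the solution:
+//
+//   viennacl::compressed_matrix<double> L(n, n);
+//   viennacl::vector<double>            rhs(n);
+//   viennacl::linalg::inplace_solve(L, rhs, viennacl::linalg::lower_tag());
+//   // unit_lower_tag(), upper_tag() and unit_upper_tag() select the other variants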
+
+
+
+
+
+
+
+//
+// Triangular solve for compressed_matrix, A^T \ b
+//
+
+namespace detail
+{
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+                               IndexArrayT const & col_buffer,
+                               ConstScalarArrayT const & element_buffer,
+                               ScalarArrayT & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::unit_lower_tag)
+  {
+    vcl_size_t col_begin = row_buffer[0];
+    for (vcl_size_t col = 0; col < num_cols; ++col)
+    {
+      NumericT vec_entry = vec_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        unsigned int row_index = col_buffer[i];
+        if (row_index > col)
+          vec_buffer[row_index] -= vec_entry * element_buffer[i];
+      }
+      col_begin = col_end;
+    }
+  }
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+                               IndexArrayT const & col_buffer,
+                               ConstScalarArrayT const & element_buffer,
+                               ScalarArrayT & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::lower_tag)
+  {
+    vcl_size_t col_begin = row_buffer[0];
+    for (vcl_size_t col = 0; col < num_cols; ++col)
+    {
+      vcl_size_t col_end = row_buffer[col+1];
+
+      // Stage 1: Find diagonal entry:
+      NumericT diagonal_entry = 0;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index == col)
+        {
+          diagonal_entry = element_buffer[i];
+          break;
+        }
+      }
+
+      // Stage 2: Substitute
+      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+      vec_buffer[col] = vec_entry;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index > col)
+          vec_buffer[row_index] -= vec_entry * element_buffer[i];
+      }
+      col_begin = col_end;
+    }
+  }
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+                               IndexArrayT const & col_buffer,
+                               ConstScalarArrayT const & element_buffer,
+                               ScalarArrayT & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::unit_upper_tag)
+  {
+    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+    {
+      vcl_size_t col = (num_cols - col2) - 1;
+
+      NumericT vec_entry = vec_buffer[col];
+      vcl_size_t col_begin = row_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index < col)
+          vec_buffer[row_index] -= vec_entry * element_buffer[i];
+      }
+
+    }
+  }
+
+  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+                               IndexArrayT const & col_buffer,
+                               ConstScalarArrayT const & element_buffer,
+                               ScalarArrayT & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::upper_tag)
+  {
+    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+    {
+      vcl_size_t col = (num_cols - col2) - 1;
+      vcl_size_t col_begin = row_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+
+      // Stage 1: Find diagonal entry:
+      NumericT diagonal_entry = 0;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index == col)
+        {
+          diagonal_entry = element_buffer[i];
+          break;
+        }
+      }
+
+      // Stage 2: Substitute
+      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+      vec_buffer[col] = vec_entry;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index < col)
+          vec_buffer[row_index] -= vec_entry * element_buffer[i];
+      }
+    }
+  }
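+
+  // The routines above solve A^T \ b without forming the transpose explicitly:
+  // row col of the stored CSR matrix is column col of A^T, so once vec_buffer[col]
+  // is final its contribution is scattered to the remaining unknowns. Hypothetical
+  // standalone sketch of the unit_lower case (n, row_buffer, col_buffer, elements
+  // and x are illustrative names only):
+  //
+  //   for (vcl_size_t col = 0; col < n; ++col)                // x[col] is already final
+  //     for (vcl_size_t i = row_buffer[col]; i < row_buffer[col+1]; ++i)
+  //       if (col_buffer[i] > col)                            // strictly sub-diagonal entry
+  //         x[col_buffer[i]] -= x[col] * elements[i];         // scatter the update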
+
+
+  //
+  // block solves
+  //
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & L,
+                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::unit_lower_tag)
+  {
+    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
+    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+    vcl_size_t col_begin = row_buffer[0];
+    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+    {
+      NumericT vec_entry = vec_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        unsigned int row_index = col_buffer[i];
+        if (row_index > col)
+          vec_buffer[row_index] -= vec_entry * elements[i];
+      }
+      col_begin = col_end;
+    }
+  }
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & L,
+                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+                           vector_base<NumericT> const & L_diagonal,
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::lower_tag)
+  {
+    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
+    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(L_diagonal.handle());
+    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+    vcl_size_t col_begin = row_buffer[0];
+    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+    {
+      vcl_size_t col_end = row_buffer[col+1];
+
+      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
+      vec_buffer[col] = vec_entry;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index > col)
+          vec_buffer[row_index] -= vec_entry * elements[i];
+      }
+      col_begin = col_end;
+    }
+  }
+
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & U,
+                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+                           vector_base<NumericT> const & /* U_diagonal */, //ignored
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::unit_upper_tag)
+  {
+    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
+    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+    {
+      vcl_size_t col = (U.lhs().size1() - col2) - 1;
+
+      NumericT vec_entry = vec_buffer[col];
+      vcl_size_t col_begin = row_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index < col)
+          vec_buffer[row_index] -= vec_entry * elements[i];
+      }
+
+    }
+  }
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & U,
+                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+                           vector_base<NumericT> const & U_diagonal,
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::upper_tag)
+  {
+    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
+    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(U_diagonal.handle());
+    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+    {
+      vcl_size_t col = (U.lhs().size1() - col2) - 1;
+      vcl_size_t col_begin = row_buffer[col];
+      vcl_size_t col_end = row_buffer[col+1];
+
+      // substitute using the externally provided diagonal entry
+      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
+      vec_buffer[col] = vec_entry;
+      for (vcl_size_t i = col_begin; i < col_end; ++i)
+      {
+        vcl_size_t row_index = col_buffer[i];
+        if (row_index < col)
+          vec_buffer[row_index] -= vec_entry * elements[i];
+      }
+    }
+  }
+
+
+} //namespace detail
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy  Proxy object for a transposed CSR-matrix
+* @param vec    The right hand side vector
+* @param tag    The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::unit_lower_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy  Proxy object for a transposed CSR-matrix
+* @param vec    The right hand side vector
+* @param tag    The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::lower_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy  Proxy object for a transposed CSR-matrix
+* @param vec    The right hand side vector
+* @param tag    The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::unit_upper_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy  Proxy object for a transposed CSR-matrix
+* @param vec    The right hand side vector
+* @param tag    The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+                                      const compressed_matrix<NumericT, AlignmentV>,
+                                      op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   viennacl::linalg::upper_tag tag)
+{
+  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
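+
+// Minimal usage sketch for the transposed solves above. Whether a trans()
+// convenience overload exists for compressed_matrix depends on the ViennaCL
+// version, so the op_trans proxy is built explicitly here (illustration only):
+//
+//   typedef viennacl::compressed_matrix<double> SpMat;
+//   SpMat L(n, n);                               // lower triangular; its transpose is upper triangular
+//   viennacl::vector<double> rhs(n);
+//   viennacl::matrix_expression<const SpMat, const SpMat, viennacl::op_trans> proxy(L, L);
+//   viennacl::linalg::inplace_solve(proxy, rhs, viennacl::linalg::upper_tag());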
+
+
+
+//
+// Compressed Compressed Matrix
+//
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor for the initial entries in result
+*/
+template<typename NumericT>
+void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT           * result_buf  = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT     const * vec_buf     = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements    = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+  unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+  unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+  if (beta < 0 || beta > 0)
+  {
+    for (vcl_size_t i = 0; i< result.size(); ++i)
+      result_buf[i * result.stride() + result.start()] *= beta;
+  }
+  else // flush
+  {
+    for (vcl_size_t i = 0; i< result.size(); ++i)
+      result_buf[i * result.stride() + result.start()] = 0;
+  }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)
+  {
+    NumericT dot_prod = 0;
+    vcl_size_t row_end = row_buffer[i+1];
+    for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
+      dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];
+
+    if (beta > 0 || beta < 0)
+      result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()] += alpha * dot_prod;
+    else
+      result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()]  = alpha * dot_prod;
+  }
+
+}
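+
+// Minimal usage sketch; a compressed_compressed_matrix stores only rows with
+// nonzeros (row indices in handle3()), and the generic prod() entry point is
+// assumed to forward to the kernel above with alpha = 1 and beta = 0:
+//
+//   viennacl::compressed_compressed_matrix<double> A;   // filled via viennacl::copy()
+//   viennacl::vector<double> x(n), y(m);
+//   y = viennacl::linalg::prod(A, x);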
+
+
+
+//
+// Coordinate Matrix
+//
+
+namespace detail
+{
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(vec.handle());
+    NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
+    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+    NumericT value = 0;
+    unsigned int last_row = 0;
+
+    for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+    {
+      unsigned int current_row = coord_buffer[2*i];
+
+      if (current_row != last_row)
+      {
+        if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+          value = std::sqrt(value);
+
+        result_buf[last_row] = value;
+        value = 0;
+        last_row = current_row;
+      }
+
+      switch (info_selector)
+      {
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+          value = std::max<NumericT>(value, std::fabs(elements[i]));
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+          value += std::fabs(elements[i]);
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+          value += elements[i] * elements[i];
+          break;
+
+        case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+          if (coord_buffer[2*i+1] == current_row)
+            value = elements[i];
+          break;
+
+        //default:
+        //  break;
+      }
+    }
+
+    if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+      value = std::sqrt(value);
+
+    result_buf[last_row] = value;
+  }
+} //namespace detail
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor for the initial entries in result
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT     const * vec_buf      = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+  if (beta < 0 || beta > 0)
+  {
+    for (vcl_size_t i = 0; i< result.size(); ++i)
+      result_buf[i * result.stride() + result.start()] *= beta;
+  }
+  else // flush
+  {
+    for (vcl_size_t i = 0; i< result.size(); ++i)
+      result_buf[i * result.stride() + result.start()] = 0;
+  }
+
+  for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+    result_buf[coord_buffer[2*i] * result.stride() + result.start()]
+      += alpha * elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
+}
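+
+// Minimal usage sketch; a coordinate_matrix keeps interleaved (row, col) index
+// pairs (handle12()) plus values and is assumed to be filled from host data,
+// e.g. via viennacl::copy() from a std::vector< std::map<unsigned int, double> >:
+//
+//   viennacl::coordinate_matrix<double> A;
+//   viennacl::vector<double> x(n), y(m);
+//   y = viennacl::linalg::prod(A, x);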
+
+/** @brief Carries out coordinate_matrix (COO) - dense matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat     The Sparse Matrix (Coordinate format)
+* @param d_mat      The Dense Matrix
+* @param result     The Result Matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result) {
+
+  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.row_major() ) {
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+    {
+      if (result.row_major())
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+        NumericT y = d_mat_wrapper_row( c, col);
+        if (result.row_major())
+          result_wrapper_row(r, col) += x * y;
+        else
+          result_wrapper_col(r, col) += x * y;
+      }
+    }
+  }
+
+  else {
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+    {
+      if (result.row_major())
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+      for (vcl_size_t i = 0; i < sp_mat.nnz(); ++i) {
+
+        NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+        vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+        vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+        NumericT y = d_mat_wrapper_col( c, col);
+
+        if (result.row_major())
+          result_wrapper_row( r, col) += x*y;
+        else
+          result_wrapper_col( r, col) += x*y;
+      }
+
+    }
+  }
+
+}
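+
+// Minimal usage sketch covering this overload and the transposed variant below
+// (sizes illustrative):
+//
+//   viennacl::coordinate_matrix<double> A;                  // m x k
+//   viennacl::matrix<double> B(k, n), Bt(n, k), C(m, n);
+//   C = viennacl::linalg::prod(A, B);
+//   C = viennacl::linalg::prod(A, viennacl::trans(Bt));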
+
+
+/** @brief Carries out coordinate_matrix (COO) - transposed dense matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat     The Sparse Matrix (Coordinate format)
+* @param d_mat      The Dense Transposed Matrix
+* @param result     The Result Matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+                     viennacl::matrix_base<NumericT> & result) {
+
+  NumericT     const * sp_mat_elements     = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.lhs().row_major() )
+  {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+    {
+      if (result.row_major())
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+      if (result.row_major())
+      {
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+          NumericT y = d_mat_wrapper_row( col, c);
+          result_wrapper_row(r, col) += x * y;
+        }
+      }
+      else
+      {
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+          NumericT y = d_mat_wrapper_row( col, c);
+          result_wrapper_col(r, col) += x * y;
+        }
+      }
+    }
+
+
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+    {
+      if (result.row_major())
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+      if (result.row_major())
+      {
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+          NumericT y = d_mat_wrapper_col( col, c);
+          result_wrapper_row(r, col) += x * y;
+        }
+      }
+      else
+      {
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+          NumericT y = d_mat_wrapper_col( col, c);
+          result_wrapper_col(r, col) += x * y;
+        }
+      }
+    }
+  }
+
+}
+
+
+
+//
+// ELL Matrix
+//
+/** @brief Carries out matrix-vector multiplication with an ell_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor for the initial entries in result
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT     const * vec_buf      = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * coords       = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+  for (vcl_size_t row = 0; row < mat.size1(); ++row)
+  {
+    NumericT sum = 0;
+
+    for (unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
+    {
+      vcl_size_t offset = row + item_id * mat.internal_size1();
+      NumericT val = elements[offset];
+
+      if (val > 0 || val < 0)
+      {
+        unsigned int col = coords[offset];
+        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+      }
+    }
+
+    if (beta < 0 || beta > 0)
+    {
+      vcl_size_t index = row * result.stride() + result.start();
+      result_buf[index] = alpha * sum + beta * result_buf[index];
+    }
+    else
+      result_buf[row * result.stride() + result.start()] = alpha * sum;
+  }
+}
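+
+// Minimal usage sketch; an ell_matrix pads every row to internal_maxnnz() entries
+// (padding is skipped by the val != 0 test above) and is assumed to be filled
+// from host data via viennacl::copy():
+//
+//   viennacl::ell_matrix<double> A;
+//   viennacl::vector<double> x(n), y(m);
+//   y = viennacl::linalg::prod(A, x);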
+
+/** @brief Carries out ell_matrix - dense matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat     The sparse(ELL) matrix
+* @param d_mat      The dense matrix
+* @param result     The result dense matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result)
+{
+  NumericT     const * sp_mat_elements     = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+    {
+      if (result.row_major())
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+    {
+      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
+      {
+        vcl_size_t offset = static_cast<vcl_size_t>(row) + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
+        NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+        vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+        if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+        {
+          if (result.row_major())
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper_row(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
+          else
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper_col(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
+        }
+      }
+    }
+  }
+  else {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+    {
+      if (result.row_major())
+        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+      for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+          vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+          NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+          vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+          if (sp_mat_val < 0 || sp_mat_val > 0)  // sp_mat_val != 0 without compiler warnings
+          {
+            if (result.row_major())
+              result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
+            else
+              result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
+          }
+        }
+      }
+    }
+  }
+
+}
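// A minimal usage sketch for the ELL kernel above: the public entry point is
// viennacl::linalg::prod(), which dispatches to this host-based prod_impl().
// The sizes, values, and the std::vector<std::map<...>> fill path below are
// illustrative assumptions only.
#include <map>
#include <vector>
#include "viennacl/ell_matrix.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

inline void ell_times_dense_sketch()
{
  std::vector< std::map<unsigned int, double> > cpu_A(3);    // sparse 3x3 input
  cpu_A[0][0] = 1.0; cpu_A[1][2] = 2.0; cpu_A[2][1] = 3.0;

  viennacl::ell_matrix<double> A;
  viennacl::copy(cpu_A, A);                                  // host data -> ELL storage

  viennacl::matrix<double> B(3, 2);                          // dense operand
  B(0, 0) = 1.0; B(1, 1) = 2.0; B(2, 0) = 3.0;

  viennacl::matrix<double> C(3, 2);
  C = viennacl::linalg::prod(A, B);                          // runs the loops above on the host backend
}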
+
+/** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, the first
+*          matrix being a sparse ELL matrix and the second a transposed dense matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat             The sparse matrix
+* @param d_mat              The transposed dense matrix
+* @param result             The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+                     viennacl::matrix_base<NumericT> & result) {
+
+  NumericT     const * sp_mat_elements     = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+  unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  if ( d_mat.lhs().row_major() )
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+    {
+      if (result.row_major())
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+    for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+
+      for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+          vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+          NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+          vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+          if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+          {
+            if (result.row_major())
+              result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
+            else
+              result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+    {
+      if (result.row_major())
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+      else
+        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+    }
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+    for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id) {
+
+        vcl_size_t offset = row + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
+        NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+        vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+        if (sp_mat_val < 0 || sp_mat_val > 0)  // sp_mat_val != 0 without compiler warnings
+        {
+          if (result.row_major())
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
+          else
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
+        }
+      }
+    }
+  }
+
+}
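// A minimal sketch for the transposed variant above (headers as in the earlier
// sketch): viennacl::trans() builds the op_trans expression that selects this
// prod_impl() overload. Sizes and values are illustrative assumptions only.
inline void ell_times_trans_dense_sketch()
{
  std::vector< std::map<unsigned int, double> > cpu_A(3);    // sparse 3x3 input
  cpu_A[0][1] = 4.0; cpu_A[2][2] = 5.0;

  viennacl::ell_matrix<double> A;
  viennacl::copy(cpu_A, A);

  viennacl::matrix<double> B(2, 3);                          // trans(B) is 3x2
  B(0, 0) = 1.0; B(1, 2) = 2.0;

  viennacl::matrix<double> C(3, 2);
  C = viennacl::linalg::prod(A, viennacl::trans(B));         // result = prod(sp_mat, trans(d_mat))
}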
+
+
+//
+// SELL-C-\sigma Matrix
+//
+/** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix
+*
+* Implementation of the convenience expression result = alpha * prod(mat, vec) + beta * result;
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor applied to the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous contents of result
+*/
+template<typename NumericT, typename IndexT>
+void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT       * result_buf        = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT const * vec_buf           = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT const * elements          = detail::extract_raw_pointer<NumericT>(mat.handle());
+  IndexT   const * columns_per_block = detail::extract_raw_pointer<IndexT>(mat.handle1());
+  IndexT   const * column_indices    = detail::extract_raw_pointer<IndexT>(mat.handle2());
+  IndexT   const * block_start       = detail::extract_raw_pointer<IndexT>(mat.handle3());
+
+  vcl_size_t num_blocks = mat.size1() / mat.rows_per_block() + 1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long block_idx2 = 0; block_idx2 < static_cast<long>(num_blocks); ++block_idx2)
+  {
+    vcl_size_t block_idx = static_cast<vcl_size_t>(block_idx2);
+    vcl_size_t current_columns_per_block = columns_per_block[block_idx];
+
+    std::vector<NumericT> result_values(mat.rows_per_block());
+
+    for (IndexT column_entry_index = 0;
+                column_entry_index < current_columns_per_block;
+              ++column_entry_index)
+    {
+      vcl_size_t stride_start = block_start[block_idx] + column_entry_index * mat.rows_per_block();
+      // Note: This for-loop may be unrolled by hand for exploiting vectorization
+      //       Careful benchmarking recommended first, memory channels may be saturated already!
+      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+      {
+        NumericT val = elements[stride_start + row_in_block];
+
+        result_values[row_in_block] += (val > 0 || val < 0) ? vec_buf[column_indices[stride_start + row_in_block] * vec.stride() + vec.start()] * val : 0;
+      }
+    }
+
+    vcl_size_t first_row_in_matrix = block_idx * mat.rows_per_block();
+    if (beta < 0 || beta > 0)
+    {
+      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+      {
+        if (first_row_in_matrix + row_in_block < result.size())
+        {
+          vcl_size_t index = (first_row_in_matrix + row_in_block) * result.stride() + result.start();
+          result_buf[index] = alpha * result_values[row_in_block] + beta * result_buf[index];
+        }
+      }
+    }
+    else
+    {
+      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+      {
+        if (first_row_in_matrix + row_in_block < result.size())
+          result_buf[(first_row_in_matrix + row_in_block) * result.stride() + result.start()] = alpha * result_values[row_in_block];
+      }
+    }
+  }
+}
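// A minimal sketch for the sliced ELL (SELL-C-sigma) kernel above, with headers
// as in the earlier sketches plus viennacl/sliced_ell_matrix.hpp and
// viennacl/vector.hpp. A plain y = prod(A, x) corresponds to alpha = 1 and
// beta = 0 in this prod_impl(); the alpha/beta arguments let callers fold
// y = alpha * A * x + beta * y into a single pass.
#include "viennacl/sliced_ell_matrix.hpp"
#include "viennacl/vector.hpp"

inline void sliced_ell_spmv_sketch()
{
  std::vector< std::map<unsigned int, double> > cpu_A(4);    // sparse 4x4 input (values are illustrative)
  cpu_A[0][0] = 1.0; cpu_A[1][3] = 2.0; cpu_A[2][1] = 3.0; cpu_A[3][2] = 4.0;

  viennacl::sliced_ell_matrix<double> A;
  viennacl::copy(cpu_A, A);

  viennacl::vector<double> x = viennacl::scalar_vector<double>(4, 1.0);
  viennacl::vector<double> y(4);
  y = viennacl::linalg::prod(A, x);                          // alpha = 1, beta = 0
}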
+
+
+//
+// Hybrid Matrix
+//
+/** @brief Carries out matrix-vector multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression result = alpha * prod(mat, vec) + beta * result;
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor applied to the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous contents of result
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  NumericT           * result_buf     = detail::extract_raw_pointer<NumericT>(result.handle());
+  NumericT     const * vec_buf        = detail::extract_raw_pointer<NumericT>(vec.handle());
+  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
+  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+  for (vcl_size_t row = 0; row < mat.size1(); ++row)
+  {
+    NumericT sum = 0;
+
+    //
+    // Part 1: Process ELL part
+    //
+    for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+    {
+      vcl_size_t offset = row + item_id * mat.internal_size1();
+      NumericT val = elements[offset];
+
+      if (val > 0 || val < 0)
+      {
+        unsigned int col = coords[offset];
+        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+      }
+    }
+
+    //
+    // Part 2: Process CSR part (entries that did not fit into the ELL part)
+    //
+    vcl_size_t col_begin = csr_row_buffer[row];
+    vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+    for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+    {
+        sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);
+    }
+
+    if (beta < 0 || beta > 0)
+    {
+      vcl_size_t index = row * result.stride() + result.start();
+      result_buf[index] = alpha * sum + beta * result_buf[index];
+    }
+    else
+      result_buf[row * result.stride() + result.start()] = alpha * sum;
+  }
+
+}
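// A minimal sketch for the hybrid-format kernel above (ELL part plus CSR part
// for overflow entries), with viennacl/hyb_matrix.hpp added to the earlier
// headers; as before, y = prod(A, x) maps to alpha = 1, beta = 0.
#include "viennacl/hyb_matrix.hpp"

inline void hyb_spmv_sketch()
{
  std::vector< std::map<unsigned int, double> > cpu_A(3);    // sparse 3x3 input
  cpu_A[0][0] = 1.0; cpu_A[0][1] = 2.0; cpu_A[0][2] = 3.0;
  cpu_A[1][1] = 4.0; cpu_A[2][2] = 5.0;

  viennacl::hyb_matrix<double> A;                            // entries beyond the ELL width land in the CSR part
  viennacl::copy(cpu_A, A);

  viennacl::vector<double> x = viennacl::scalar_vector<double>(3, 1.0);
  viennacl::vector<double> y(3);
  y = viennacl::linalg::prod(A, x);
}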
+
+//
+// Hybrid Matrix
+//
+/** @brief Carries out sparse-matrix-dense-matrix multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression C = prod(A, B);
+*
+* @param mat    The sparse matrix A
+* @param d_mat  The dense matrix B
+* @param result The dense result matrix C
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result)
+{
+  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
+  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+  {
+    for (vcl_size_t row = 0; row < mat.size1(); ++row)
+    {
+      NumericT sum = 0;
+
+      //
+      // Part 1: Process ELL part
+      //
+      for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+      {
+        vcl_size_t offset = row + item_id * mat.internal_size1();
+        NumericT val = elements[offset];
+
+        if (val < 0 || val > 0)  // val != 0 without compiler warnings
+        {
+          vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
+          if (d_mat.row_major())
+            sum += d_mat_wrapper_row(col, result_col) * val;
+          else
+            sum += d_mat_wrapper_col(col, result_col) * val;
+        }
+      }
+
+      //
+      // Part 2: Process HYB/CSR part
+      //
+      vcl_size_t col_begin = csr_row_buffer[row];
+      vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+      if (d_mat.row_major())
+        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+          sum += d_mat_wrapper_row(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
+      else
+        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+          sum += d_mat_wrapper_col(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
+
+      if (result.row_major())
+        result_wrapper_row(row, result_col) = sum;
+      else
+        result_wrapper_col(row, result_col) = sum;
+    }
+  } // for result_col
+}
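// A minimal sketch for the hybrid sparse-matrix times dense-matrix kernel
// above, reusing the headers and fill path of the earlier sketches;
// C = prod(A, B) selects this overload.
inline void hyb_times_dense_sketch()
{
  std::vector< std::map<unsigned int, double> > cpu_A(3);    // sparse 3x3 input
  cpu_A[0][0] = 1.0; cpu_A[1][2] = 2.0; cpu_A[2][1] = 3.0;

  viennacl::hyb_matrix<double> A;
  viennacl::copy(cpu_A, A);

  viennacl::matrix<double> B(3, 2);
  B(0, 0) = 1.0; B(2, 1) = 2.0;

  viennacl::matrix<double> C(3, 2);
  C = viennacl::linalg::prod(A, B);
}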
+
+
+/** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression C = prod(A, trans(B));
+*
+* @param mat    The sparse matrix A
+* @param d_mat  The transposed dense matrix B
+* @param result The dense result matrix C
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+                     viennacl::matrix_base<NumericT> & result)
+{
+  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+  vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+  vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+  vcl_size_t result_start1 = viennacl::traits::start1(result);
+  vcl_size_t result_start2 = viennacl::traits::start2(result);
+  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+  vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+  vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+  detail::matrix_array_wrapper<NumericT const, row_major, false>
+      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+  detail::matrix_array_wrapper<NumericT const, column_major, false>
+      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+  detail::matrix_array_wrapper<NumericT, row_major, false>
+      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+  detail::matrix_array_wrapper<NumericT, column_major, false>
+      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
+  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
+  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+  {
+    for (vcl_size_t row = 0; row < mat.size1(); ++row)
+    {
+      NumericT sum = 0;
+
+      //
+      // Part 1: Process ELL part
+      //
+      for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+      {
+        vcl_size_t offset = row + item_id * mat.internal_size1();
+        NumericT val = elements[offset];
+
+        if (val < 0 || val > 0)  // val != 0 without compiler warnings
+        {
+          vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
+          if (d_mat.lhs().row_major())
+            sum += d_mat_wrapper_row(result_col, col) * val;
+          else
+            sum += d_mat_wrapper_col(result_col, col) * val;
+        }
+      }
+
+      //
+      // Part 2: Process HYB/CSR part
+      //
+      vcl_size_t col_begin = csr_row_buffer[row];
+      vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+      if (d_mat.lhs().row_major())

<TRUNCATED>

[51/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
(nojira) add native-viennaCL module to codebase. closes apache/mahout#241


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/f7c1f802
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/f7c1f802
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/f7c1f802

Branch: refs/heads/master
Commit: f7c1f8026296bae1eab9768564a871ea1dd6583b
Parents: 1fca074
Author: Andrew Palumbo <ap...@apache.org>
Authored: Wed Jun 8 17:37:01 2016 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Wed Jun 8 17:39:19 2016 -0400

----------------------------------------------------------------------
 LICENSE.txt                                     |   29 +
 math-scala/pom.xml                              |    5 +
 .../scalabindings/viennacl/ViennaCLTests.scala  |   40 +
 native-viennaCL/pom.xml                         |  249 ++
 native-viennaCL/src/main/cpp/HelloNative.h      |   10 +
 .../src/main/cpp/libviennacl/CMakeLists.txt     |   40 +
 .../main/cpp/libviennacl/include/viennacl.hpp   |  617 +++
 .../src/main/cpp/libviennacl/src/backend.cpp    |   46 +
 .../src/main/cpp/libviennacl/src/backend.cu     |   46 +
 .../src/main/cpp/libviennacl/src/blas1.cpp      |  420 ++
 .../src/main/cpp/libviennacl/src/blas1.cu       |  420 ++
 .../src/main/cpp/libviennacl/src/blas1_cuda.cu  |  264 ++
 .../src/main/cpp/libviennacl/src/blas1_host.cpp |  293 ++
 .../src/main/cpp/libviennacl/src/blas1_host.cu  |  293 ++
 .../main/cpp/libviennacl/src/blas1_opencl.cpp   |  297 ++
 .../main/cpp/libviennacl/src/blas1_opencl.cu    |  297 ++
 .../src/main/cpp/libviennacl/src/blas2.cpp      |  231 +
 .../src/main/cpp/libviennacl/src/blas2.cu       |  231 +
 .../src/main/cpp/libviennacl/src/blas2_cuda.cu  |  204 +
 .../src/main/cpp/libviennacl/src/blas2_host.cpp |  219 +
 .../src/main/cpp/libviennacl/src/blas2_host.cu  |  219 +
 .../main/cpp/libviennacl/src/blas2_opencl.cpp   |  219 +
 .../main/cpp/libviennacl/src/blas2_opencl.cu    |  219 +
 .../src/main/cpp/libviennacl/src/blas3.cpp      |  272 ++
 .../src/main/cpp/libviennacl/src/blas3.cu       |  272 ++
 .../src/main/cpp/libviennacl/src/blas3.hpp      |   60 +
 .../src/main/cpp/libviennacl/src/blas3_cuda.cu  |  133 +
 .../src/main/cpp/libviennacl/src/blas3_host.cpp |  131 +
 .../src/main/cpp/libviennacl/src/blas3_host.cu  |  131 +
 .../main/cpp/libviennacl/src/blas3_opencl.cpp   |  136 +
 .../main/cpp/libviennacl/src/blas3_opencl.cu    |  136 +
 .../main/cpp/libviennacl/src/init_matrix.hpp    |  101 +
 .../main/cpp/libviennacl/src/init_vector.hpp    |  101 +
 .../cpp/libviennacl/src/viennacl_private.hpp    |  141 +
 .../src/main/cpp/viennacl/backend/cpu_ram.hpp   |  171 +
 .../src/main/cpp/viennacl/backend/cuda.hpp      |  206 +
 .../main/cpp/viennacl/backend/mem_handle.hpp    |  250 ++
 .../src/main/cpp/viennacl/backend/memory.hpp    |  628 +++
 .../src/main/cpp/viennacl/backend/opencl.hpp    |  151 +
 .../src/main/cpp/viennacl/backend/util.hpp      |  268 ++
 .../src/main/cpp/viennacl/circulant_matrix.hpp  |  359 ++
 .../viennacl/compressed_compressed_matrix.hpp   |  619 +++
 .../src/main/cpp/viennacl/compressed_matrix.hpp | 1178 ++++++
 .../src/main/cpp/viennacl/context.hpp           |   88 +
 .../src/main/cpp/viennacl/coordinate_matrix.hpp |  506 +++
 .../src/main/cpp/viennacl/detail/matrix_def.hpp |  270 ++
 .../src/main/cpp/viennacl/detail/vector_def.hpp |  349 ++
 .../device_specific/builtin_database/common.hpp |  219 +
 .../devices/accelerator/fallback.hpp            |   85 +
 .../builtin_database/devices/cpu/fallback.hpp   |   84 +
 .../devices/gpu/amd/evergreen/cedar.hpp         |   64 +
 .../devices/gpu/amd/evergreen/cypress.hpp       |   65 +
 .../devices/gpu/amd/northern_islands/barts.hpp  |   64 +
 .../gpu/amd/northern_islands/devastator.hpp     |   64 +
 .../gpu/amd/northern_islands/scrapper.hpp       |   64 +
 .../devices/gpu/amd/southern_islands/tahiti.hpp |   84 +
 .../devices/gpu/amd/volcanic_islands/hawaii.hpp |   84 +
 .../builtin_database/devices/gpu/fallback.hpp   |   84 +
 .../gpu/nvidia/fermi/geforce_gt_540m.hpp        |   59 +
 .../gpu/nvidia/fermi/geforce_gtx_470.hpp        |   83 +
 .../gpu/nvidia/fermi/geforce_gtx_580.hpp        |   84 +
 .../devices/gpu/nvidia/fermi/tesla_c2050.hpp    |   84 +
 .../devices/gpu/nvidia/kepler/tesla_k20m.hpp    |   84 +
 .../gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp   |   85 +
 .../gpu/nvidia/tesla/geforce_gtx_260.hpp        |   84 +
 .../builtin_database/matrix_product.hpp         |  244 ++
 .../cpp/viennacl/device_specific/execute.hpp    |   55 +
 .../device_specific/execution_handler.hpp       |  102 +
 .../cpp/viennacl/device_specific/forwards.h     |  294 ++
 .../device_specific/lazy_program_compiler.hpp   |   74 +
 .../viennacl/device_specific/mapped_objects.hpp |  512 +++
 .../templates/matrix_product_template.hpp       |  859 ++++
 .../device_specific/templates/template_base.hpp |  596 +++
 .../device_specific/templates/utils.hpp         |  105 +
 .../viennacl/device_specific/tree_parsing.hpp   |  512 +++
 .../main/cpp/viennacl/device_specific/utils.hpp |  568 +++
 .../src/main/cpp/viennacl/ell_matrix.hpp        |  362 ++
 native-viennaCL/src/main/cpp/viennacl/fft.hpp   |  282 ++
 .../src/main/cpp/viennacl/forwards.h            | 1032 +++++
 .../src/main/cpp/viennacl/hankel_matrix.hpp     |  343 ++
 .../src/main/cpp/viennacl/hyb_matrix.hpp        |  442 ++
 .../src/main/cpp/viennacl/io/matrix_market.hpp  |  440 ++
 .../src/main/cpp/viennacl/linalg/amg.hpp        |  398 ++
 .../main/cpp/viennacl/linalg/amg_operations.hpp |  238 ++
 .../src/main/cpp/viennacl/linalg/bicgstab.hpp   |  598 +++
 .../src/main/cpp/viennacl/linalg/bisect.hpp     |  179 +
 .../src/main/cpp/viennacl/linalg/bisect_gpu.hpp |  173 +
 .../src/main/cpp/viennacl/linalg/cg.hpp         |  440 ++
 .../linalg/circulant_matrix_operations.hpp      |   75 +
 .../cpp/viennacl/linalg/cuda/amg_operations.hpp |  821 ++++
 .../linalg/cuda/bisect_kernel_calls.hpp         |  166 +
 .../linalg/cuda/bisect_kernel_large.hpp         |  928 ++++
 .../linalg/cuda/bisect_kernel_large_multi.hpp   |  277 ++
 .../linalg/cuda/bisect_kernel_large_onei.hpp    |  180 +
 .../linalg/cuda/bisect_kernel_small.hpp         |  261 ++
 .../cpp/viennacl/linalg/cuda/bisect_util.hpp    |  613 +++
 .../main/cpp/viennacl/linalg/cuda/common.hpp    |  250 ++
 .../cpp/viennacl/linalg/cuda/direct_solve.hpp   |  412 ++
 .../cpp/viennacl/linalg/cuda/fft_operations.hpp |  858 ++++
 .../cpp/viennacl/linalg/cuda/ilu_operations.hpp |  666 +++
 .../linalg/cuda/iterative_operations.hpp        | 2049 +++++++++
 .../viennacl/linalg/cuda/matrix_operations.hpp  | 2725 ++++++++++++
 .../linalg/cuda/matrix_operations_col.hpp       | 1847 ++++++++
 .../linalg/cuda/matrix_operations_prod.hpp      | 2887 +++++++++++++
 .../linalg/cuda/matrix_operations_row.hpp       | 1468 +++++++
 .../viennacl/linalg/cuda/misc_operations.hpp    |   91 +
 .../cpp/viennacl/linalg/cuda/nmf_operations.hpp |  152 +
 .../viennacl/linalg/cuda/scalar_operations.hpp  |  375 ++
 .../linalg/cuda/sparse_matrix_operations.hpp    | 2809 +++++++++++++
 .../cuda/sparse_matrix_operations_solve.hpp     |  761 ++++
 .../main/cpp/viennacl/linalg/cuda/spgemm.hpp    |  793 ++++
 .../cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp  |  669 +++
 .../viennacl/linalg/cuda/vector_operations.hpp  | 3252 ++++++++++++++
 .../cpp/viennacl/linalg/detail/amg/amg_base.hpp |  208 +
 .../detail/bisect/bisect_kernel_calls.hpp       |  191 +
 .../linalg/detail/bisect/bisect_large.hpp       |  142 +
 .../linalg/detail/bisect/bisect_small.hpp       |   96 +
 .../viennacl/linalg/detail/bisect/config.hpp    |   44 +
 .../linalg/detail/bisect/gerschgorin.hpp        |   94 +
 .../viennacl/linalg/detail/bisect/structs.hpp   |  182 +
 .../cpp/viennacl/linalg/detail/bisect/util.hpp  |  106 +
 .../viennacl/linalg/detail/ilu/block_ilu.hpp    |  617 +++
 .../linalg/detail/ilu/chow_patel_ilu.hpp        |  316 ++
 .../cpp/viennacl/linalg/detail/ilu/common.hpp   |  263 ++
 .../cpp/viennacl/linalg/detail/ilu/ilu0.hpp     |  379 ++
 .../cpp/viennacl/linalg/detail/ilu/ilut.hpp     |  597 +++
 .../cpp/viennacl/linalg/detail/op_applier.hpp   |  103 +
 .../cpp/viennacl/linalg/detail/op_executor.hpp  |   86 +
 .../linalg/detail/spai/block_matrix.hpp         |   86 +
 .../linalg/detail/spai/block_vector.hpp         |   77 +
 .../cpp/viennacl/linalg/detail/spai/fspai.hpp   |  402 ++
 .../main/cpp/viennacl/linalg/detail/spai/qr.hpp |  497 +++
 .../linalg/detail/spai/small_matrix.hpp         |  113 +
 .../linalg/detail/spai/spai-dynamic.hpp         |  687 +++
 .../viennacl/linalg/detail/spai/spai-static.hpp |  192 +
 .../cpp/viennacl/linalg/detail/spai/spai.hpp    |  832 ++++
 .../viennacl/linalg/detail/spai/spai_tag.hpp    |  143 +
 .../linalg/detail/spai/sparse_vector.hpp        |   85 +
 .../main/cpp/viennacl/linalg/direct_solve.hpp   |  580 +++
 .../src/main/cpp/viennacl/linalg/eig.hpp        |   29 +
 .../main/cpp/viennacl/linalg/fft_operations.hpp |  481 +++
 .../src/main/cpp/viennacl/linalg/gmres.hpp      |  738 ++++
 .../linalg/hankel_matrix_operations.hpp         |   66 +
 .../linalg/host_based/amg_operations.hpp        | 1123 +++++
 .../cpp/viennacl/linalg/host_based/common.hpp   |  149 +
 .../viennacl/linalg/host_based/direct_solve.hpp |  307 ++
 .../linalg/host_based/fft_operations.hpp        |  856 ++++
 .../linalg/host_based/ilu_operations.hpp        |  672 +++
 .../linalg/host_based/iterative_operations.hpp  |  880 ++++
 .../linalg/host_based/matrix_operations.hpp     | 2052 +++++++++
 .../linalg/host_based/misc_operations.hpp       |   80 +
 .../linalg/host_based/nmf_operations.hpp        |  247 ++
 .../linalg/host_based/scalar_operations.hpp     |  162 +
 .../host_based/sparse_matrix_operations.hpp     | 2081 +++++++++
 .../linalg/host_based/spgemm_vector.hpp         |  705 ++++
 .../linalg/host_based/vector_operations.hpp     | 1188 ++++++
 .../src/main/cpp/viennacl/linalg/ichol.hpp      |  228 +
 .../src/main/cpp/viennacl/linalg/ilu.hpp        |   33 +
 .../main/cpp/viennacl/linalg/ilu_operations.hpp |  334 ++
 .../src/main/cpp/viennacl/linalg/inner_prod.hpp |  186 +
 .../viennacl/linalg/iterative_operations.hpp    |  425 ++
 .../main/cpp/viennacl/linalg/jacobi_precond.hpp |  141 +
 .../src/main/cpp/viennacl/linalg/lanczos.hpp    |  515 +++
 .../src/main/cpp/viennacl/linalg/lu.hpp         |  227 +
 .../cpp/viennacl/linalg/matrix_operations.hpp   | 1303 ++++++
 .../src/main/cpp/viennacl/linalg/maxmin.hpp     |  152 +
 .../cpp/viennacl/linalg/misc_operations.hpp     |   94 +
 .../cpp/viennacl/linalg/mixed_precision_cg.hpp  |  199 +
 .../src/main/cpp/viennacl/linalg/nmf.hpp        |   91 +
 .../src/main/cpp/viennacl/linalg/norm_1.hpp     |  104 +
 .../src/main/cpp/viennacl/linalg/norm_2.hpp     |  140 +
 .../main/cpp/viennacl/linalg/norm_frobenius.hpp |   73 +
 .../src/main/cpp/viennacl/linalg/norm_inf.hpp   |  108 +
 .../viennacl/linalg/opencl/amg_operations.hpp   |  458 ++
 .../linalg/opencl/bisect_kernel_calls.hpp       |  177 +
 .../main/cpp/viennacl/linalg/opencl/common.hpp  |  102 +
 .../cpp/viennacl/linalg/opencl/direct_solve.hpp |  153 +
 .../viennacl/linalg/opencl/fft_operations.hpp   |  350 ++
 .../viennacl/linalg/opencl/ilu_operations.hpp   |  260 ++
 .../linalg/opencl/iterative_operations.hpp      |  945 +++++
 .../cpp/viennacl/linalg/opencl/kernels/amg.hpp  |  393 ++
 .../viennacl/linalg/opencl/kernels/bisect.hpp   | 2645 ++++++++++++
 .../kernels/compressed_compressed_matrix.hpp    |  110 +
 .../linalg/opencl/kernels/compressed_matrix.hpp | 1703 ++++++++
 .../linalg/opencl/kernels/coordinate_matrix.hpp |  405 ++
 .../linalg/opencl/kernels/ell_matrix.hpp        |  221 +
 .../cpp/viennacl/linalg/opencl/kernels/fft.hpp  |  311 ++
 .../linalg/opencl/kernels/hyb_matrix.hpp        |  240 ++
 .../cpp/viennacl/linalg/opencl/kernels/ilu.hpp  |  505 +++
 .../linalg/opencl/kernels/iterative.hpp         | 1619 +++++++
 .../viennacl/linalg/opencl/kernels/matrix.hpp   | 1193 ++++++
 .../linalg/opencl/kernels/matrix_element.hpp    |  138 +
 .../linalg/opencl/kernels/matrix_solve.hpp      |  180 +
 .../cpp/viennacl/linalg/opencl/kernels/nmf.hpp  |   99 +
 .../viennacl/linalg/opencl/kernels/scalar.hpp   |  283 ++
 .../cpp/viennacl/linalg/opencl/kernels/scan.hpp |  194 +
 .../linalg/opencl/kernels/sliced_ell_matrix.hpp |  135 +
 .../cpp/viennacl/linalg/opencl/kernels/spai.hpp |  631 +++
 .../cpp/viennacl/linalg/opencl/kernels/svd.hpp  |  703 ++++
 .../viennacl/linalg/opencl/kernels/vector.hpp   |  867 ++++
 .../linalg/opencl/kernels/vector_element.hpp    |  163 +
 .../linalg/opencl/matrix_operations.hpp         | 1019 +++++
 .../viennacl/linalg/opencl/misc_operations.hpp  |   69 +
 .../viennacl/linalg/opencl/nmf_operations.hpp   |  139 +
 .../linalg/opencl/scalar_operations.hpp         |  205 +
 .../linalg/opencl/sparse_matrix_operations.hpp  | 1244 ++++++
 .../opencl/vandermonde_matrix_operations.hpp    |   68 +
 .../linalg/opencl/vector_operations.hpp         | 1263 ++++++
 .../src/main/cpp/viennacl/linalg/power_iter.hpp |  129 +
 .../src/main/cpp/viennacl/linalg/prod.hpp       |  370 ++
 .../cpp/viennacl/linalg/qr-method-common.hpp    |  188 +
 .../src/main/cpp/viennacl/linalg/qr-method.hpp  |  833 ++++
 .../src/main/cpp/viennacl/linalg/qr.hpp         |  669 +++
 .../main/cpp/viennacl/linalg/row_scaling.hpp    |  204 +
 .../cpp/viennacl/linalg/scalar_operations.hpp   |  242 ++
 .../src/main/cpp/viennacl/linalg/spai.hpp       |  292 ++
 .../linalg/sparse_matrix_operations.hpp         |  421 ++
 .../src/main/cpp/viennacl/linalg/sum.hpp        |  136 +
 .../src/main/cpp/viennacl/linalg/svd.hpp        |  533 +++
 .../linalg/toeplitz_matrix_operations.hpp       |   78 +
 .../src/main/cpp/viennacl/linalg/tql2.hpp       |  262 ++
 .../linalg/vandermonde_matrix_operations.hpp    |   71 +
 .../cpp/viennacl/linalg/vector_operations.hpp   | 1304 ++++++
 .../src/main/cpp/viennacl/matrix.hpp            | 3968 ++++++++++++++++++
 .../src/main/cpp/viennacl/matrix_proxy.hpp      |  595 +++
 .../src/main/cpp/viennacl/meta/enable_if.hpp    |   43 +
 .../src/main/cpp/viennacl/meta/predicate.hpp    |  534 +++
 .../src/main/cpp/viennacl/meta/result_of.hpp    |  631 +++
 .../src/main/cpp/viennacl/meta/tag_of.hpp       |  390 ++
 .../cpp/viennacl/misc/bandwidth_reduction.hpp   |   38 +
 .../main/cpp/viennacl/misc/cuthill_mckee.hpp    |  611 +++
 .../viennacl/misc/gibbs_poole_stockmeyer.hpp    |  426 ++
 .../src/main/cpp/viennacl/ocl/backend.hpp       |  358 ++
 .../src/main/cpp/viennacl/ocl/command_queue.hpp |   92 +
 .../src/main/cpp/viennacl/ocl/context.hpp       |  820 ++++
 .../src/main/cpp/viennacl/ocl/device.hpp        | 1496 +++++++
 .../src/main/cpp/viennacl/ocl/device_utils.hpp  |  191 +
 .../src/main/cpp/viennacl/ocl/enqueue.hpp       |  148 +
 .../src/main/cpp/viennacl/ocl/error.hpp         |  687 +++
 .../src/main/cpp/viennacl/ocl/forwards.h        |   63 +
 .../src/main/cpp/viennacl/ocl/handle.hpp        |  228 +
 .../src/main/cpp/viennacl/ocl/kernel.hpp        |  805 ++++
 .../src/main/cpp/viennacl/ocl/local_mem.hpp     |   51 +
 .../src/main/cpp/viennacl/ocl/platform.hpp      |  145 +
 .../src/main/cpp/viennacl/ocl/program.hpp       |   86 +
 .../src/main/cpp/viennacl/ocl/utils.hpp         |   85 +
 native-viennaCL/src/main/cpp/viennacl/range.hpp |   76 +
 .../src/main/cpp/viennacl/scalar.hpp            |  874 ++++
 .../src/main/cpp/viennacl/scheduler/execute.hpp |  290 ++
 .../cpp/viennacl/scheduler/execute_axbx.hpp     |  381 ++
 .../viennacl/scheduler/execute_elementwise.hpp  |  410 ++
 .../scheduler/execute_generic_dispatcher.hpp    |  134 +
 .../scheduler/execute_matrix_dispatcher.hpp     |  185 +
 .../viennacl/scheduler/execute_matrix_prod.hpp  |  406 ++
 .../scheduler/execute_scalar_assign.hpp         |  192 +
 .../scheduler/execute_scalar_dispatcher.hpp     |  131 +
 .../cpp/viennacl/scheduler/execute_util.hpp     |  324 ++
 .../scheduler/execute_vector_dispatcher.hpp     |  200 +
 .../src/main/cpp/viennacl/scheduler/forwards.h  |  830 ++++
 .../src/main/cpp/viennacl/scheduler/io.hpp      |  268 ++
 .../src/main/cpp/viennacl/scheduler/preset.hpp  |  105 +
 native-viennaCL/src/main/cpp/viennacl/slice.hpp |   77 +
 .../src/main/cpp/viennacl/sliced_ell_matrix.hpp |  383 ++
 .../src/main/cpp/viennacl/toeplitz_matrix.hpp   |  384 ++
 .../src/main/cpp/viennacl/tools/adapter.hpp     |  421 ++
 .../src/main/cpp/viennacl/tools/entry_proxy.hpp |  234 ++
 .../cpp/viennacl/tools/matrix_generation.hpp    |  146 +
 .../cpp/viennacl/tools/matrix_size_deducer.hpp  |  204 +
 .../src/main/cpp/viennacl/tools/random.hpp      |   84 +
 .../src/main/cpp/viennacl/tools/sha1.hpp        |  237 ++
 .../src/main/cpp/viennacl/tools/shared_ptr.hpp  |  179 +
 .../src/main/cpp/viennacl/tools/timer.hpp       |  123 +
 .../src/main/cpp/viennacl/tools/tools.hpp       |  315 ++
 .../src/main/cpp/viennacl/traits/clear.hpp      |   62 +
 .../src/main/cpp/viennacl/traits/context.hpp    |   65 +
 .../src/main/cpp/viennacl/traits/fill.hpp       |   68 +
 .../src/main/cpp/viennacl/traits/handle.hpp     |  270 ++
 .../src/main/cpp/viennacl/traits/row_major.hpp  |   50 +
 .../src/main/cpp/viennacl/traits/size.hpp       |  442 ++
 .../src/main/cpp/viennacl/traits/start.hpp      |  102 +
 .../src/main/cpp/viennacl/traits/stride.hpp     |   72 +
 .../main/cpp/viennacl/vandermonde_matrix.hpp    |  342 ++
 .../src/main/cpp/viennacl/vector.hpp            | 3235 ++++++++++++++
 .../src/main/cpp/viennacl/vector_proxy.hpp      |  340 ++
 .../src/main/cpp/viennacl/version.hpp           |   31 +
 .../apache/mahout/javaCppTest/HelloNative.scala |   28 +
 .../mahout/javaCppTest/ViennaMatrix.scala       |   34 +
 .../apache/mahout/javaCppTest/ViennaSvd.scala   |    8 +
 .../mahout/javaCppTest/ViennaVector.scala       |    8 +
 .../javaCppTest/HelloNativeTestSuite.scala      |   15 +
 pom.xml                                         |    1 +
 runtests.sh                                     |    1 +
 292 files changed, 120789 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index 8ce7fff..dcc3c4e 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -799,3 +799,32 @@ The following license applies to the H2O package
       identification within third-party archives.
 
    Copyright 2012 0xdata, Inc
+
+================================================================
+The following applies to the ViennaCL library and files in the native-ViennaCL module
+================================================================
+
+   Copyright (c) 2010-2016 Institute for Microelectronics,
+                           Institute for Analysis and Scientific Computing, TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+   Argonne National Laboratory, with facilities in the state of Illinois,
+   is owned by The United States Government, and operated by UChicago Argonne, LLC
+   under provision of a contract with the Department of Energy.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/math-scala/pom.xml
----------------------------------------------------------------------
diff --git a/math-scala/pom.xml b/math-scala/pom.xml
index 280a7a1..de96b03 100644
--- a/math-scala/pom.xml
+++ b/math-scala/pom.xml
@@ -122,6 +122,11 @@
       <artifactId>mahout-math</artifactId>
     </dependency>
 
+    <!--<dependency>-->
+      <!--<groupId>org.apache.mahout</groupId>-->
+      <!--<artifactId>mahout-native-viennacl_2.10</artifactId>-->
+    <!--</dependency>-->
+
     <dependency>
       <groupId>com.esotericsoftware.kryo</groupId>
       <artifactId>kryo</artifactId>

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
new file mode 100644
index 0000000..aa7466e
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.math.scalabindings.viennacl
+
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.MahoutSuite
+import org.scalatest.FunSuite
+//import org.apache.mahout.javaCppTest._
+
+import scala.math._
+
+class ViennaCLTests extends FunSuite with MahoutSuite {
+
+  private final implicit val log = getLog(classOf[ViennaCLTests])
+
+//  test("HelloNative"){
+//    val nTest = new HelloNative
+//
+//    nTest.set_property("Hello Native")
+//    assert(nTest.get_property() == "Hello Native")
+//  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/pom.xml
----------------------------------------------------------------------
diff --git a/native-viennaCL/pom.xml b/native-viennaCL/pom.xml
new file mode 100644
index 0000000..a144adc
--- /dev/null
+++ b/native-viennaCL/pom.xml
@@ -0,0 +1,249 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.mahout</groupId>
+    <artifactId>mahout</artifactId>
+    <version>0.12.2-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <!--<artifactId>mahout-native-viennacl_${scala.compat.version}</artifactId>-->
+  <artifactId>mahout-native-viennacl_2.10</artifactId>
+
+  <name>Mahout Native ViennaCL Bindings</name>
+  <description>Native Structures and interfaces to be used from Mahout math-scala.
+  </description>
+
+  <packaging>jar</packaging>
+
+  <build>
+    <plugins>
+      <!-- create test jar so other modules can reuse the native test utility classes. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+            <phase>package</phase>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <artifactId>maven-javadoc-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <artifactId>maven-source-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>net.alchim31.maven</groupId>
+        <artifactId>scala-maven-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>add-scala-sources</id>
+            <phase>initialize</phase>
+            <goals>
+              <goal>add-source</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>scala-compile</id>
+            <phase>process-resources</phase>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>scala-test-compile</id>
+            <phase>process-test-resources</phase>
+            <goals>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <!--this is what scalatest recommends to do to enable scala tests -->
+
+      <!-- disable surefire -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <skipTests>true</skipTests>
+        </configuration>
+      </plugin>
+      <!-- enable scalatest -->
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>test</id>
+            <goals>
+              <goal>test</goal>
+            </goals>
+          </execution>
+
+        </executions>
+      </plugin>
+
+
+
+      <!--JavaCPP native build plugin-->
+      <!--  old-style way to get it to compile. -->
+      <!--based on https://github.com/bytedeco/javacpp/wiki/Maven-->
+      <plugin>
+          <groupId>org.codehaus.mojo</groupId>
+          <artifactId>exec-maven-plugin</artifactId>
+          <version>1.2.1</version>
+          <executions>
+              <execution>
+                  <id>javacpp</id>
+                  <phase>process-classes</phase>
+                  <goals>
+                      <goal>exec</goal>
+                  </goals>
+                  <configuration>
+                      <executable>java</executable>
+                      <arguments>
+                          <argument>-jar</argument>
+                          <argument>${org.bytedeco:javacpp:jar}</argument>
+                          <argument>-classpath</argument>
+                          <argument>${project.build.outputDirectory}</argument>
+                          <argument>-Xcompiler</argument>
+                          <argument>-I${basedir}/src/main/cpp</argument>
+                          <!--<argument>-I${basedir}/src/main/cpp/viennacl</argument>-->
+                          <!--<argument>-I${basedir}/src/main/cpp/viennacl/linalg</argument>-->
+                          <!--<argument>-I${basedir}/src/main/cpp/libviennacl/include</argument>-->
+                          <!--<argument>-I${basedir}/src/main/cpp/libviennacl/include/src</argument>-->
+                      </arguments>
+                  </configuration>
+              </execution>
+          </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <version>2.3</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>properties</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>1.2.1</version>
+      </plugin>
+
+    </plugins>
+
+  </build>
+
+  <dependencies>
+
+    <!--  3rd-party -->
+    <dependency>
+      <groupId>log4j</groupId>
+      <artifactId>log4j</artifactId>
+    </dependency>
+
+    <!-- scala stuff -->
+    <dependency>
+      <groupId>org.scalatest</groupId>
+      <artifactId>scalatest_${scala.compat.version}</artifactId>
+    </dependency>
+
+
+    <dependency>
+      <groupId>org.bytedeco</groupId>
+      <artifactId>javacpp</artifactId>
+      <version>1.2.1</version>
+    </dependency>
+
+  </dependencies>
+
+
+  <profiles>
+    <profile>
+      <id>mahout-release</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>net.alchim31.maven</groupId>
+            <artifactId>scala-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>generate-scaladoc</id>
+                <goals>
+                  <goal>doc</goal>
+                </goals>
+              </execution>
+              <execution>
+                <id>attach-scaladoc-jar</id>
+                <goals>
+                  <goal>doc-jar</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+    <profile>
+      <id>travis</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-surefire-plugin</artifactId>
+            <configuration>
+              <!-- Limit memory for unit tests in Travis -->
+              <argLine>-Xmx3g</argLine>
+              <!--<argLine>-Djava.library.path=${project.build.directory}/libs/natives/linux-x86_64:${project.build.directory}/libs/natives/linux:${project.build.directory}/libs/natives/maxosx</argLine>-->
+            </configuration>
+          </plugin>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-failsafe-plugin</artifactId>
+            <configuration>
+              <!-- Limit memory for integration tests in Travis -->
+              <argLine>-Xmx3g</argLine>
+              <!--<argLine>-Djava.library.path=${project.build.directory}/libs/natives/linux-x86_64:${project.build.directory}/libs/natives/linux:${project.build.directory}/libs/natives/maxosx</argLine>-->
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/HelloNative.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/HelloNative.h b/native-viennaCL/src/main/cpp/HelloNative.h
new file mode 100644
index 0000000..d101f94
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/HelloNative.h
@@ -0,0 +1,10 @@
+#include <string>
+
+namespace HelloNative {
+    class HelloNative {
+        public:
+            const std::string& get_property() { return property; }
+            void set_property(const std::string& property) { this->property = property; }
+            std::string property;
+    };
+}
\ No newline at end of file
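// A minimal usage sketch for the header above; the commented-out Scala test in
// ViennaCLTests.scala drives the same set_property/get_property round trip
// through JavaCPP. The main() wrapper and iostream output are assumptions for
// a plain C++ check.
#include <iostream>
#include "HelloNative.h"

int main()
{
  HelloNative::HelloNative n;
  n.set_property("Hello Native");
  std::cout << n.get_property() << std::endl;   // prints "Hello Native"
  return 0;
}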

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt b/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
new file mode 100644
index 0000000..d2f83e5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
@@ -0,0 +1,40 @@
+
+include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+
+if(ENABLE_CUDA)
+
+  if(ENABLE_OPENCL)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+    cuda_add_library(viennacl SHARED src/backend.cu
+                                     src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu src/blas1_opencl.cu
+                                     src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu src/blas2_opencl.cu
+                                     src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu src/blas3_opencl.cu)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -DVIENNACL_WITH_CUDA")
+    target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    cuda_add_library(viennacl SHARED src/backend.cu
+                                     src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu
+                                     src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu
+                                     src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_CUDA")
+  endif(ENABLE_OPENCL)
+else(ENABLE_CUDA)
+  if(ENABLE_OPENCL)
+    add_library(viennacl SHARED src/backend.cpp
+                                src/blas1.cpp src/blas1_host.cpp src/blas1_opencl.cpp
+                                src/blas2.cpp src/blas2_host.cpp src/blas2_opencl.cpp
+                                src/blas3.cpp src/blas3_host.cpp src/blas3_opencl.cpp)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    add_library(viennacl SHARED src/backend.cpp
+                                src/blas1.cpp src/blas1_host.cpp
+                                src/blas2.cpp src/blas2_host.cpp
+                                src/blas3.cpp src/blas3_host.cpp)
+  endif(ENABLE_OPENCL)
+endif(ENABLE_CUDA)
+
+# Special linkage for OpenMP under MinGW:
+if(ENABLE_OPENMP AND MINGW)
+   target_link_libraries(viennacl gomp)
+endif(ENABLE_OPENMP AND MINGW)

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp b/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
new file mode 100644
index 0000000..0b7b97a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
@@ -0,0 +1,617 @@
+#ifndef VIENNACL_VIENNACL_HPP
+#define VIENNACL_VIENNACL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+
+// Extra export declarations when building with Visual Studio:
+#if defined(_MSC_VER)
+  #if defined(viennacl_EXPORTS)
+    #define  VIENNACL_EXPORTED_FUNCTION __declspec(dllexport)
+  #else
+    #define  VIENNACL_EXPORTED_FUNCTION __declspec(dllimport)
+  #endif /* viennacl_EXPORTS */
+#else /* defined (_MSC_VER) */
+ #define VIENNACL_EXPORTED_FUNCTION
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int ViennaCLInt;
+
+
+/************** Enums ***************/
+
+typedef enum
+{
+  ViennaCLInvalidBackend, // for catching uninitialized and invalid values
+  ViennaCLCUDA,
+  ViennaCLOpenCL,
+  ViennaCLHost
+} ViennaCLBackendTypes;
+
+typedef enum
+{
+  ViennaCLInvalidOrder,  // for catching uninitialized and invalid values
+  ViennaCLRowMajor,
+  ViennaCLColumnMajor
+} ViennaCLOrder;
+
+typedef enum
+{
+  ViennaCLInvalidTranspose, // for catching uninitialized and invalid values
+  ViennaCLNoTrans,
+  ViennaCLTrans
+} ViennaCLTranspose;
+
+typedef enum
+{
+  ViennaCLInvalidUplo, // for catching uninitialized and invalid values
+  ViennaCLUpper,
+  ViennaCLLower
+} ViennaCLUplo;
+
+typedef enum
+{
+  ViennaCLInvalidDiag, // for catching uninitialized and invalid values
+  ViennaCLUnit,
+  ViennaCLNonUnit
+} ViennaCLDiag;
+
+typedef enum
+{
+  ViennaCLInvalidPrecision,  // for catching uninitialized and invalid values
+  ViennaCLFloat,
+  ViennaCLDouble
+} ViennaCLPrecision;
+
+// Error codes:
+typedef enum
+{
+  ViennaCLSuccess = 0,
+  ViennaCLGenericFailure
+} ViennaCLStatus;
+
+
+/************* Backend Management ******************/
+
+/** @brief Generic backend handle covering the CUDA, OpenCL, and host-based execution paths */
+struct ViennaCLBackend_impl;
+typedef ViennaCLBackend_impl*   ViennaCLBackend;
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend);
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl;
+typedef ViennaCLHostScalar_impl*    ViennaCLHostScalar;
+
+struct ViennaCLScalar_impl;
+typedef ViennaCLScalar_impl*        ViennaCLScalar;
+
+struct ViennaCLVector_impl;
+typedef ViennaCLVector_impl*        ViennaCLVector;
+
+struct ViennaCLMatrix_impl;
+typedef ViennaCLMatrix_impl*        ViennaCLMatrix;
+
+
+/******************** BLAS Level 1 ***********************/
+
+// IxAMAX (index of the entry with the largest absolute value)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *alpha,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *alpha,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x,     ViennaCLVector y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s);
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+
+/******************** BLAS Level 2 ***********************/
+
+// xGEMV: y <- alpha * Ax + beta * y
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xTRSV: x <- A^{-1} * x   (in-place triangular solve)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xGER: A <- alpha * x * y^T + A   (rank-1 update)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m,  ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             double alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+
+
+
+/******************** BLAS Level 3 ***********************/
+
+// xGEMM: C <- alpha * AB + beta * C
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+// xTRSM: Triangular solves with multiple right hand sides
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
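
A minimal usage sketch for the C API declared above, exercising only the host backend (this assumes libviennacl has been built as in the CMake file above and that viennacl.hpp is on the include path; error handling is reduced to a single check):

  #include <cstdio>
  #include "viennacl.hpp"

  int main()
  {
    float x[4] = {1.f, 2.f, 3.f, 4.f};
    float y[4] = {4.f, 3.f, 2.f, 1.f};

    ViennaCLBackend backend;
    if (ViennaCLBackendCreate(&backend) != ViennaCLSuccess)
      return 1;

    ViennaCLHostSaxpy(backend, 4, 2.0f, x, 0, 1, y, 0, 1);   // y <- 2*x + y (offset 0, unit stride)

    float result = 0.0f;
    ViennaCLHostSdot(backend, 4, &result, x, 0, 1, y, 0, 1); // result <- dot(x, y)
    std::printf("dot(x, y) = %f\n", result);

    ViennaCLBackendDestroy(&backend);
    return 0;
  }

The ViennaCLCUDA* and ViennaCLOpenCL* variants follow the same naming and argument pattern but take device pointers and cl_mem buffers, respectively, instead of host pointers.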

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
@@ -0,0 +1,46 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+  *backend = new ViennaCLBackend_impl();
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+  backend->opencl_backend.context_id = context_id;
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+  delete *backend;
+  *backend = NULL;
+
+  return ViennaCLSuccess;
+}
+
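
The backend object handled above is an opaque pointer: ViennaCLBackendCreate allocates it, ViennaCLBackendDestroy frees it and nulls the caller's handle, and the only state touched here is the OpenCL context id. A short sketch of the lifecycle (assuming an OpenCL-enabled build and that the referenced ViennaCL OpenCL context has been set up elsewhere):

  ViennaCLBackend backend;
  ViennaCLBackendCreate(&backend);
  ViennaCLBackendSetOpenCLContextID(backend, 1);  // record which ViennaCL-managed OpenCL context the OpenCL entry points should use
  /* ... calls such as ViennaCLOpenCLSaxpy(backend, ...) ... */
  ViennaCLBackendDestroy(&backend);               // also resets the handle to NULL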

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
@@ -0,0 +1,46 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+  *backend = new ViennaCLBackend_impl();
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+  backend->opencl_backend.context_id = context_id;
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+  delete *backend;
+  *backend = NULL;
+
+  return ViennaCLSuccess;
+}
+


[08/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
new file mode 100644
index 0000000..d24a641
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
@@ -0,0 +1,1619 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/vector_proxy.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+#include "viennacl/scheduler/preset.hpp"
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/iterative.hpp
+ *  @brief OpenCL kernel file for specialized iterative solver kernels */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_pipelined_cg_vector_update(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_vector_update( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * r, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_r = r[i]; \n");
+  source.append("     \n");
+  source.append("    result[i] += alpha * value_p; \n");
+  source.append("    value_r   -= alpha * Ap[i]; \n");
+  source.append("    value_p    = value_r + beta * value_p; \n");
+  source.append("     \n");
+  source.append("    p[i] = value_p; \n");
+  source.append("    r[i] = value_r; \n");
+  source.append("    inner_prod_contrib += value_r * value_r; \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride)  \n");
+  source.append("      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // write results to result array
+  source.append(" if (get_local_id(0) == 0) \n ");
+  source.append("   inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
+
+  source.append("} \n");
+}
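+
+// A minimal usage sketch for the generator above (an illustration, assuming the
+// caller compiles the assembled string itself instead of going through the
+// library's own kernel management):
+//
+//   std::string src;
+//   viennacl::linalg::opencl::kernels::generate_pipelined_cg_vector_update(src, "float");
+//   // 'src' now holds the OpenCL C source of the cg_vector_update kernel and
+//   // can be handed to any OpenCL program object for compilation.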
+
+template<typename StringT>
+void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
+{
+  std::stringstream ss;
+  ss << subwarp_size;
+
+  source.append("__kernel void cg_csr_blocked_prod( \n");
+  source.append("    __global const unsigned int * row_indices, \n");
+  source.append("    __global const unsigned int * column_indices, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("    unsigned int size, \n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("    unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+  source.append("  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+  source.append("  const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
+  source.append("  const unsigned int block_start = get_group_id(0) * block_increment; \n");
+  source.append("  const unsigned int block_stop  = min(block_start + block_increment, size); \n");
+
+  source.append("  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + "; \n");
+  source.append("                    row  < block_stop; \n");
+  source.append("                    row += get_local_size(0) / " + ss.str() + ") \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+  source.append("      dot_prod += elements[i] * p[column_indices[i]]; \n");
+
+  source.append("    shared_elements[get_local_id(0)] = dot_prod; \n");
+  source.append("    #pragma unroll \n");
+  source.append("    for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+  source.append("      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+  source.append("    if (id_in_row == 0) { \n");
+  source.append("      Ap[row] = shared_elements[get_local_id(0)]; \n");
+  source.append("      inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
+  source.append("      inner_prod_pAp  +=                           p[row] * shared_elements[get_local_id(0)]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  ////////// parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_csr_prod( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
+  source.append("{ \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+  source.append("  for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
+  source.append("    unsigned int row_start = row_blocks[block_id]; \n");
+  source.append("    unsigned int row_stop  = row_blocks[block_id + 1]; \n");
+  source.append("    unsigned int rows_to_process = row_stop - row_start; \n");
+  source.append("    unsigned int element_start = row_indices[row_start]; \n");
+  source.append("    unsigned int element_stop = row_indices[row_stop]; \n");
+
+  source.append("    if (rows_to_process > 1) { \n"); // CSR stream
+      // load to shared buffer:
+  source.append("      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("        shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
+
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      // use one thread per row to sum:
+  source.append("      for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+  source.append("        "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("        unsigned int thread_row_start = row_indices[row]     - element_start; \n");
+  source.append("        unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
+  source.append("        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+  source.append("          dot_prod += shared_elements[i]; \n");
+  source.append("        Ap[row] = dot_prod; \n");
+  source.append("        inner_prod_ApAp += dot_prod * dot_prod; \n");
+  source.append("        inner_prod_pAp  +=   p[row] * dot_prod; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("    else  \n"); // CSR vector for a single row
+  source.append("    { \n");
+      // load and sum to shared buffer:
+  source.append("      shared_elements[get_local_id(0)] = 0; \n");
+  source.append("      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("        shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
+
+      // reduction to obtain final result
+  source.append("      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < stride) \n");
+  source.append("          shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+  source.append("      } \n");
+
+  source.append("      if (get_local_id(0) == 0) { \n");
+  source.append("        Ap[row_start] = shared_elements[0]; \n");
+  source.append("        inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
+  source.append("        inner_prod_pAp  +=       p[row_start] * shared_elements[0]; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
+
+
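+/** @brief Generates the OpenCL kernel 'cg_coo_prod' for the pipelined CG solver with a coordinate_matrix.
+ *
+ * Each work group processes the (row, column) pairs between two group_boundaries entries using a
+ * segmented parallel reduction, writes the resulting rows of Ap = A * p, and contributes one partial
+ * sum per group of <Ap, Ap> and <p, Ap> to inner_prod_buffer (at offsets buffer_size and 2*buffer_size).
+ */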
+template<typename StringT>
+void generate_coordinate_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_coo_prod( \n");
+  source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const uint  * group_boundaries, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __local unsigned int * shared_rows, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+  ///////////// Sparse matrix-vector multiplication part /////////////
+  source.append("  uint2 tmp; \n");
+  source.append("  "); source.append(numeric_string); source.append(" val; \n");
+  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+  source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+  source.append("  uint local_index = 0; \n");
+
+  source.append("  for (uint k = 0; k < k_end; ++k) { \n");
+  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+  source.append("    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
+
+  //check for carry from previous loop run:
+  source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
+  source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+  source.append("        val += inter_results[get_local_size(0)-1]; \n");
+  source.append("      else {\n");
+  source.append("        "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
+  source.append("        Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
+  source.append("        inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("        inner_prod_pAp  += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  //segmented parallel reduction begin
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+  source.append("    inter_results[get_local_id(0)] = val; \n");
+  source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+  source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      inter_results[get_local_id(0)] += left; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+  //segmented parallel reduction end
+
+  source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+  source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+  source.append("      Ap[tmp.x] = Ap_entry; \n");
+  source.append("      inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("      inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  }  \n"); //for k
+
+  source.append("  if (local_index + 1 == group_end) {\n");  //write results of last active entry (this may not necessarily be the case already)
+  source.append("    "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+  source.append("    Ap[tmp.x] = Ap_entry; \n");
+  source.append("    inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("    inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
+  source.append("  }  \n");
+
+  //////////// parallel reduction of inner product contributions within work group ///////////////
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n \n");
+
+}
+
+
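+/** @brief Generates the OpenCL kernel 'cg_ell_prod' for the pipelined CG solver with an ell_matrix.
+ *
+ * One work item per row walks the column-major ELL storage (stride internal_row_num), skipping
+ * zero padding entries, then the usual per-group partial sums of <Ap, Ap> and <p, Ap> are reduced
+ * in local memory and written to inner_prod_buffer.
+ */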
+template<typename StringT>
+void generate_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_ell_prod( \n");
+  source.append("  __global const unsigned int * coords, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row = glb_id; row < size; row += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+  source.append("      sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("    } \n");
+
+  source.append("    Ap[row] = sum; \n");
+  source.append("    inner_prod_ApAp += sum * sum; \n");
+  source.append("    inner_prod_pAp  += p[row] * sum; \n");
+  source.append("  }  \n");
+
+  //////////// parallel reduction of inner product contributions within work group ///////////////
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
+
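+/** @brief Generates the OpenCL kernel 'cg_sliced_ell_prod' for the pipelined CG solver with a sliced_ell_matrix.
+ *
+ * The work group is split into sub-groups of block_size work items, each handling one slice of rows;
+ * the indexing assumes the work group size is a multiple of block_size. Partial sums of <Ap, Ap> and
+ * <p, Ap> are reduced per work group and written to inner_prod_buffer.
+ */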
+template<typename StringT>
+void generate_sliced_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_sliced_ell_prod( \n");
+  source.append("  __global const unsigned int * columns_per_block, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * block_start, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  unsigned int block_size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+  source.append("  uint id_in_block = get_local_id(0) % block_size; \n");
+  source.append("  uint num_blocks  = (size - 1) / block_size + 1; \n");
+  source.append("  uint global_warp_count  = blocks_per_workgroup * get_num_groups(0); \n");
+  source.append("  uint global_warp_id     = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+  source.append("  for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint row    = block_idx * block_size + id_in_block; \n");
+  source.append("    uint offset = block_start[block_idx]; \n");
+  source.append("    uint num_columns = columns_per_block[block_idx]; \n");
+  source.append("    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+  source.append("      uint index = offset + item_id * block_size + id_in_block; \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+  source.append("      sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
+  source.append("    } \n");
+
+  source.append("    if (row < size) {\n");
+  source.append("      Ap[row] = sum; \n");
+  source.append("      inner_prod_ApAp += sum * sum; \n");
+  source.append("      inner_prod_pAp  += p[row] * sum; \n");
+  source.append("    }  \n");
+  source.append("  }  \n");
+
+  //////////// parallel reduction of inner product contributions within work group ///////////////
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
+
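+/** @brief Generates the OpenCL kernel 'cg_hyb_prod' for the pipelined CG solver with a hyb_matrix.
+ *
+ * Each row accumulates the ELL part (padded, column-major) plus the CSR remainder, then the
+ * per-group partial sums of <Ap, Ap> and <p, Ap> are written to inner_prod_buffer as for the other formats.
+ */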
+template<typename StringT>
+void generate_hyb_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void cg_hyb_prod( \n");
+  source.append("  const __global int* ell_coords, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+  source.append("  const __global uint* csr_rows, \n");
+  source.append("  const __global uint* csr_cols, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row = glb_id; row < size; row += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+  source.append("      sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
+  source.append("    } \n");
+
+  source.append("    uint col_begin = csr_rows[row]; \n");
+  source.append("    uint col_end   = csr_rows[row + 1]; \n");
+
+  source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+  source.append("      sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
+  source.append("    } \n");
+
+  source.append("    Ap[row] = sum; \n");
+  source.append("    inner_prod_ApAp += sum * sum; \n");
+  source.append("    inner_prod_pAp  += p[row] * sum; \n");
+  source.append("  }  \n");
+
+  //////////// parallel reduction of inner product contributions within work group ///////////////
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
+
+
+//////////////////////////////////////////////////////
+
+
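+/** @brief Generates the OpenCL kernel 'bicgstab_update_s' for the pipelined BiCGStab solver.
+ *
+ * The kernel first reduces the per-group partial results of <r, r0*> (first chunk of inner_prod_buffer)
+ * and <Ap, r0*> (chunk starting at 3 * chunk_size) to obtain alpha, then updates s = r - alpha * Ap and
+ * writes one partial sum of <s, s> per work group at offset chunk_offset.
+ */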
+template<typename StringT>
+void generate_pipelined_bicgstab_update_s(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_update_s( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * s, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * r, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int chunk_size, \n");
+  source.append("  unsigned int chunk_offset, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_Ap_in_r0) \n");
+  source.append("{ \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" alpha = 0; \n");
+
+  // parallel reduction in work group to compute alpha = <r, r0*> / <Ap, r0*>
+  source.append("  shared_array[get_local_id(0)]  = inner_prod_buffer[get_local_id(0)]; \n");
+  source.append("  shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array[get_local_id(0)]  += shared_array[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // compute alpha from reduced values:
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
+
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
+  source.append("     \n");
+  source.append("    value_s = r[i] - alpha * Ap[i]; \n");
+  source.append("    inner_prod_contrib += value_s * value_s; \n");
+  source.append("     \n");
+  source.append("    s[i] = value_s; \n");
+  source.append("  }  \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride)  \n");
+  source.append("      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // write results to result array
+  source.append(" if (get_local_id(0) == 0) \n ");
+  source.append("   inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
+
+  source.append("} \n");
+
+}
+
+
+
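+/** @brief Generates the OpenCL kernel 'bicgstab_vector_update' for the pipelined BiCGStab solver.
+ *
+ * Fused vector update per iteration: result += alpha * p + omega * s, residual = s - omega * As,
+ * p = residual + beta * (p - omega * Ap), plus one partial sum per work group of <residual, r0*>
+ * written to inner_prod_buffer[group_id].
+ */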
+template<typename StringT>
+void generate_pipelined_bicgstab_vector_update(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_vector_update( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  "); source.append(numeric_string); source.append(" omega, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * s, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * residual, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * As, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" const * r0star, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r_r0star = 0; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_result = result[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_residual = residual[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_As = As[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_Ap = Ap[i]; \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_r0star = r0star[i]; \n");
+  source.append("     \n");
+  source.append("    value_result += alpha * value_p + omega * value_s; \n");
+  source.append("    value_residual  = value_s - omega * value_As; \n");
+  source.append("    value_p         = value_residual + beta * (value_p - omega * value_Ap); \n");
+  source.append("     \n");
+  source.append("    result[i]   = value_result; \n");
+  source.append("    residual[i] = value_residual; \n");
+  source.append("    p[i]        = value_p; \n");
+  source.append("    inner_prod_r_r0star += value_residual * value_r0star; \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride)  \n");
+  source.append("      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // write results to result array
+  source.append(" if (get_local_id(0) == 0) \n ");
+  source.append("   inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
+
+  source.append("} \n");
+}
+
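+/** @brief Generates the OpenCL kernel 'bicgstab_csr_blocked_prod' (compressed_matrix, subwarp variant) for pipelined BiCGStab.
+ *
+ * subwarp_size work items cooperate on each CSR row (the value is baked into the kernel string).
+ * Per-group partial sums of <Ap, Ap>, <p, Ap> and <r0*, Ap> go to inner_prod_buffer at offsets
+ * buffer_size, 2*buffer_size and buffer_offset. The fixed local array of 256 entries limits the
+ * work group size to at most 256 work items.
+ */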
+template<typename StringT>
+void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
+{
+  std::stringstream ss;
+  ss << subwarp_size;
+
+  source.append("__kernel void bicgstab_csr_blocked_prod( \n");
+  source.append("    __global const unsigned int * row_indices, \n");
+  source.append("    __global const unsigned int * column_indices, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("    unsigned int size, \n");
+  source.append("    __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+  source.append("  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+  source.append("  const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
+  source.append("  const unsigned int block_start = get_group_id(0) * block_increment; \n");
+  source.append("  const unsigned int block_stop  = min(block_start + block_increment, size); \n");
+
+  source.append("  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + "; \n");
+  source.append("                    row  < block_stop; \n");
+  source.append("                    row += get_local_size(0) / " + ss.str() + ") \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+  source.append("      dot_prod += elements[i] * p[column_indices[i]]; \n");
+
+  source.append("    shared_elements[get_local_id(0)] = dot_prod; \n");
+  source.append("    #pragma unroll \n");
+  source.append("    for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+  source.append("      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+  source.append("    if (id_in_row == 0) { \n");
+  source.append("      Ap[row] = shared_elements[get_local_id(0)]; \n");
+  source.append("      inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
+  source.append("      inner_prod_pAp  +=                           p[row] * shared_elements[get_local_id(0)]; \n");
+  source.append("      inner_prod_r0Ap +=                      r0star[row] * shared_elements[get_local_id(0)]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+}
+
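+/** @brief Generates the OpenCL kernel 'bicgstab_csr_prod' (compressed_matrix, adaptive CSR) for pipelined BiCGStab.
+ *
+ * Same CSR-stream / CSR-vector scheme as the CG variant above, extended by the partial sums of
+ * <r0*, Ap>, which are written to inner_prod_buffer[buffer_offset + group_id].
+ */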
+template<typename StringT>
+void generate_compressed_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_csr_prod( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+  source.append("  for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
+  source.append("    unsigned int row_start = row_blocks[block_id]; \n");
+  source.append("    unsigned int row_stop  = row_blocks[block_id + 1]; \n");
+  source.append("    unsigned int rows_to_process = row_stop - row_start; \n");
+  source.append("    unsigned int element_start = row_indices[row_start]; \n");
+  source.append("    unsigned int element_stop = row_indices[row_stop]; \n");
+
+  source.append("    if (rows_to_process > 1) { \n"); // CSR stream
+      // load to shared buffer:
+  source.append("      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("        shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
+
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      // use one thread per row to sum:
+  source.append("      for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+  source.append("        "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("        unsigned int thread_row_start = row_indices[row]     - element_start; \n");
+  source.append("        unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
+  source.append("        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+  source.append("          dot_prod += shared_elements[i]; \n");
+  source.append("        Ap[row] = dot_prod; \n");
+  source.append("        inner_prod_ApAp += dot_prod * dot_prod; \n");
+  source.append("        inner_prod_pAp  +=   p[row] * dot_prod; \n");
+  source.append("        inner_prod_r0Ap  += r0star[row] * dot_prod; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("    else  \n"); // CSR vector for a single row
+  source.append("    { \n");
+      // load and sum to shared buffer:
+  source.append("      shared_elements[get_local_id(0)] = 0; \n");
+  source.append("      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("        shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
+
+      // reduction to obtain final result
+  source.append("      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < stride) \n");
+  source.append("          shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+  source.append("      } \n");
+
+  source.append("      if (get_local_id(0) == 0) { \n");
+  source.append("        Ap[row_start] = shared_elements[0]; \n");
+  source.append("        inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
+  source.append("        inner_prod_pAp  +=       p[row_start] * shared_elements[0]; \n");
+  source.append("        inner_prod_r0Ap +=  r0star[row_start] * shared_elements[0]; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n \n");
+
+}
+
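+/** @brief Generates the OpenCL kernel 'bicgstab_coo_prod' (coordinate_matrix) for pipelined BiCGStab.
+ *
+ * Segmented reduction over (row, column) pairs as in the CG variant, with the additional
+ * per-group partial sum of <r0*, Ap> written at buffer_offset.
+ */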
+template<typename StringT>
+void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_coo_prod( \n");
+  source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const uint  * group_boundaries, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __local unsigned int * shared_rows, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+  ///////////// Sparse matrix-vector multiplication part /////////////
+  source.append("  uint2 tmp; \n");
+  source.append("  "); source.append(numeric_string); source.append(" val; \n");
+  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+  source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+  source.append("  uint local_index = 0; \n");
+
+  source.append("  for (uint k = 0; k < k_end; ++k) { \n");
+  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+  source.append("    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
+
+  //check for carry from previous loop run:
+  source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
+  source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+  source.append("        val += inter_results[get_local_size(0)-1]; \n");
+  source.append("      else {\n");
+  source.append("        "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
+  source.append("        Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
+  source.append("        inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("        inner_prod_pAp  += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+  source.append("        inner_prod_r0Ap  += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  //segmented parallel reduction begin
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+  source.append("    inter_results[get_local_id(0)] = val; \n");
+  source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+  source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      inter_results[get_local_id(0)] += left; \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    } \n");
+  //segmented parallel reduction end
+
+  source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+  source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+  source.append("      Ap[tmp.x] = Ap_entry; \n");
+  source.append("      inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("      inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
+  source.append("      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  }  \n"); //for k
+
+  source.append("  if (local_index + 1 == group_end) {\n");  //write results of last active entry (this may not necessarily be the case already)
+  source.append("    "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+  source.append("    Ap[tmp.x] = Ap_entry; \n");
+  source.append("    inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+  source.append("    inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
+  source.append("    inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n \n");
+
+}
+
+
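+/** @brief Generates the OpenCL kernel 'bicgstab_ell_prod' (ell_matrix) for pipelined BiCGStab.
+ *
+ * One work item per row over the padded ELL storage; per-group partial sums of <Ap, Ap>, <p, Ap>
+ * and <r0*, Ap> are written to inner_prod_buffer.
+ */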
+template<typename StringT>
+void generate_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_ell_prod( \n");
+  source.append("  __global const unsigned int * coords, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row = glb_id; row < size; row += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+  source.append("      sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("    } \n");
+
+  source.append("    Ap[row] = sum; \n");
+  source.append("    inner_prod_ApAp += sum * sum; \n");
+  source.append("    inner_prod_pAp  += p[row] * sum; \n");
+  source.append("    inner_prod_r0Ap += r0star[row] * sum; \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
+
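+/** @brief Generates the OpenCL kernel 'bicgstab_sliced_ell_prod' (sliced_ell_matrix) for pipelined BiCGStab.
+ *
+ * Sub-groups of block_size work items handle one slice of rows each, as in the CG variant,
+ * with the extra partial sum of <r0*, Ap> written at buffer_offset.
+ */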
+template<typename StringT>
+void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_sliced_ell_prod( \n");
+  source.append("  __global const unsigned int * columns_per_block, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * block_start, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  unsigned int block_size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+  source.append("  uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+  source.append("  uint id_in_block = get_local_id(0) % block_size; \n");
+  source.append("  uint num_blocks  = (size - 1) / block_size + 1; \n");
+  source.append("  uint global_warp_count  = blocks_per_workgroup * get_num_groups(0); \n");
+  source.append("  uint global_warp_id     = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+  source.append("  for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint row    = block_idx * block_size + id_in_block; \n");
+  source.append("    uint offset = block_start[block_idx]; \n");
+  source.append("    uint num_columns = columns_per_block[block_idx]; \n");
+  source.append("    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+  source.append("      uint index = offset + item_id * block_size + id_in_block; \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+  source.append("      sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
+  source.append("    } \n");
+
+  source.append("    if (row < size) {\n");
+  source.append("      Ap[row] = sum; \n");
+  source.append("      inner_prod_ApAp += sum * sum; \n");
+  source.append("      inner_prod_pAp  += p[row] * sum; \n");
+  source.append("      inner_prod_r0Ap += r0star[row] * sum; \n");
+  source.append("    }  \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
+
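+/** @brief Generates the OpenCL kernel 'bicgstab_hyb_prod' (hyb_matrix) for pipelined BiCGStab.
+ *
+ * ELL part plus CSR remainder per row, followed by the per-group reduction of <Ap, Ap>, <p, Ap>
+ * and <r0*, Ap> into inner_prod_buffer.
+ */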
+template<typename StringT>
+void generate_hyb_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void bicgstab_hyb_prod( \n");
+  source.append("  const __global int* ell_coords, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+  source.append("  const __global uint* csr_rows, \n");
+  source.append("  const __global uint* csr_cols, \n");
+  source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  unsigned int buffer_offset, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("   __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+  source.append("  uint glb_id = get_global_id(0); \n");
+  source.append("  uint glb_sz = get_global_size(0); \n");
+
+  source.append("  for (uint row = glb_id; row < size; row += glb_sz) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint offset = row; \n");
+  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+  source.append("      sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
+  source.append("    } \n");
+
+  source.append("    uint col_begin = csr_rows[row]; \n");
+  source.append("    uint col_end   = csr_rows[row + 1]; \n");
+
+  source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+  source.append("      sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
+  source.append("    } \n");
+
+  source.append("    Ap[row] = sum; \n");
+  source.append("    inner_prod_ApAp += sum * sum; \n");
+  source.append("    inner_prod_pAp  += p[row] * sum; \n");
+  source.append("    inner_prod_r0Ap += r0star[row] * sum; \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+  source.append("  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
+  source.append("  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) { \n");
+  source.append("      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
+  source.append("      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
+  source.append("    } ");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) { \n ");
+  source.append("    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+  source.append("    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+  source.append("    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+  source.append("  } \n");
+  source.append("} \n \n");
+}
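+
+// For orientation, a minimal sketch (not part of this header) of how these generator
+// functions are typically consumed: the emitted OpenCL C is collected into one source
+// string, compiled once per context, and the kernels are then fetched by name. The
+// program name and the exact registration flow below are illustrative only.
+//
+//   std::string src;
+//   src.reserve(8192);
+//   generate_hyb_matrix_pipelined_bicgstab_prod(src, "float");
+//   viennacl::ocl::context & ctx  = viennacl::ocl::current_context();
+//   viennacl::ocl::program & prog = ctx.add_program(src, "pipelined_bicgstab_example");
+//   viennacl::ocl::kernel  & k    = prog.get_kernel("bicgstab_hyb_prod");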
+
+//////////////////////////////
+
+
+template <typename StringType>
+void generate_pipelined_gmres_gram_schmidt_stage1(StringType & source, std::string const & numeric_string, bool is_nvidia)
+{
+  source.append("__kernel void gmres_gram_schmidt_1( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
+  source.append("          unsigned int size, \n");
+  source.append("          unsigned int internal_size, \n");
+  source.append("          unsigned int k, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vi_in_vk_buffer, \n");
+  source.append("          unsigned int chunk_size) \n");
+  source.append("{ \n");
+
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_array[7*128]; \n");
+  if (!is_nvidia)  // use of thread-local variables entails a 2x performance drop on NVIDIA GPUs, but is faster on AMD
+  {
+    source.append("  "); source.append(numeric_string); source.append(" vi_in_vk[7]; \n");
+  }
+  source.append("  "); source.append(numeric_string); source.append(" value_vk = 0; \n");
+
+  source.append("  unsigned int k_base = 0;   \n");
+  source.append("  while (k_base < k) {   \n");
+  source.append("    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);   \n");
+
+  if (is_nvidia)
+  {
+    source.append("    for (uint j=0; j<vecs_in_iteration; ++j) \n");
+    source.append("      shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
+  }
+  else
+  {
+    source.append("    vi_in_vk[0] = 0;\n");
+    source.append("    vi_in_vk[1] = 0;\n");
+    source.append("    vi_in_vk[2] = 0;\n");
+    source.append("    vi_in_vk[3] = 0;\n");
+    source.append("    vi_in_vk[4] = 0;\n");
+    source.append("    vi_in_vk[5] = 0;\n");
+    source.append("    vi_in_vk[6] = 0;\n");
+  }
+  source.append("    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("      value_vk = krylov_basis[i + k * internal_size]; \n");
+  source.append("       \n");
+  source.append("      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+  if (is_nvidia)
+    source.append("        shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
+  else
+    source.append("        vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
+  source.append("    }  \n");
+
+  // parallel reduction in work group
+  if (!is_nvidia)
+  {
+    source.append("    for (uint j=0; j<vecs_in_iteration; ++j) \n");
+    source.append("      shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
+  }
+  source.append("    for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("    { \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      if (get_local_id(0) < stride) { \n");
+  source.append("        for (uint j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("          shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride];  \n");
+  source.append("      } ");
+  source.append("    } ");
+
+  // write results to result array
+  source.append("    if (get_local_id(0) == 0) \n ");
+  source.append("      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("        vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
+
+  source.append("    k_base += vecs_in_iteration;   \n");
+  source.append("  }  \n");
+
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_pipelined_gmres_gram_schmidt_stage2(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_gram_schmidt_2( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * krylov_basis, \n");
+  source.append("          unsigned int size, \n");
+  source.append("          unsigned int internal_size, \n");
+  source.append("          unsigned int k, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * vi_in_vk_buffer, \n");
+  source.append("          unsigned int chunk_size, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
+  source.append("          unsigned int krylov_dim, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("         __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+  source.append("{ \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" vk_dot_vk = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" value_vk = 0; \n");
+
+  source.append("  unsigned int k_base = 0;   \n");
+  source.append("  while (k_base < k) {   \n");
+  source.append("    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);   \n");
+
+  // parallel reduction in work group for <v_i, v_k>
+  source.append("    for (uint j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("      shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
+  source.append("    for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("    { \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      if (get_local_id(0) < stride) { \n");
+  source.append("        for (uint j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("          shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride];  \n");
+  source.append("      } ");
+  source.append("    } ");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  // v_k -= <v_i, v_k> v_i:
+  source.append("    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("      value_vk = krylov_basis[i + k * internal_size]; \n");
+  source.append("       \n");
+  source.append("      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
+  source.append("      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;  \n");
+  source.append("      krylov_basis[i + k * internal_size] = value_vk;  \n");
+  source.append("    }  \n");
+
+  // write to R: (to avoid thread divergence, all threads write the same value)
+  source.append("    if (get_group_id(0) == 0) \n");
+  source.append("      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+  source.append("        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    k_base += vecs_in_iteration;   \n");
+  source.append("  }  \n");
+
+  // parallel reduction in work group for <v_k, v_k>
+  source.append("  shared_array[get_local_id(0)] = vk_dot_vk; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) \n ");
+  source.append("    inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
+
+  source.append("} \n");
+}
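+
+// Taken together, gmres_gram_schmidt_1 and gmres_gram_schmidt_2 perform one classical
+// Gram-Schmidt step for the k-th Krylov vector, processed in panels of at most 7
+// previously orthonormalized basis vectors v_i:
+//
+//   R(i,k) = <v_i, v_k>                  (stage 1: per-workgroup partial sums,
+//                                         stage 2: reduction over those partials)
+//   v_k   <- v_k - sum_i R(i,k) * v_i    (stage 2)
+//   second chunk of inner_prod_buffer   <- per-workgroup partial sums of <v_k, v_k>,
+//                                          consumed later by gmres_normalize_vk.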
+
+template <typename StringType>
+void generate_pipelined_gmres_normalize_vk(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_normalize_vk( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * vk, \n");
+  source.append("          unsigned int vk_offset, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * residual, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
+  source.append("          unsigned int R_offset, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * inner_prod_buffer, \n");
+  source.append("          unsigned int chunk_size, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * r_dot_vk_buffer, \n");
+  source.append("          unsigned int chunk_offset, \n");
+  source.append("          unsigned int size, \n");
+  source.append("         __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+  source.append("{ \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" norm_vk = 0; \n");
+
+  // parallel reduction in work group to compute <vk, vk>
+  source.append("  shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("      shared_array[get_local_id(0)]  += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // compute alpha from reduced values:
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("  norm_vk = sqrt(shared_array[0]); \n");
+
+  source.append("  "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_vk = vk[i + vk_offset] / norm_vk; \n");
+  source.append("     \n");
+  source.append("    inner_prod_contrib += residual[i] * value_vk; \n");
+  source.append("     \n");
+  source.append("    vk[i + vk_offset] = value_vk; \n");
+  source.append("  }  \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  // parallel reduction in work group
+  source.append("  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+  source.append("  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+  source.append("  { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride)  \n");
+  source.append("      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
+  source.append("  } ");
+
+  // write results to result array
+  source.append("  if (get_local_id(0) == 0) \n ");
+  source.append("    r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
+  source.append("  if (get_global_id(0) == 0) \n ");
+  source.append("    R_buffer[R_offset] = norm_vk; \n");
+
+  source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_pipelined_gmres_update_result(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_update_result( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * residual, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
+  source.append("          unsigned int size, \n");
+  source.append("          unsigned int internal_size, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" const * coefficients, \n");
+  source.append("          unsigned int k) \n");
+  source.append("{ \n");
+
+  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value_result = result[i] + coefficients[0] * residual[i]; \n");
+  source.append("     \n");
+  source.append("    for (unsigned int j = 1; j < k; ++j) \n");
+  source.append("      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
+  source.append("     \n");
+  source.append("    result[i] = value_result; \n");
+  source.append("  }  \n");
+
+  source.append("} \n");
+}
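+
+// The kernels generated in this section are launched internally by the pipelined GMRES
+// driver; users reach them through the regular solver frontend. A minimal sketch of that
+// call path (matrix/vector setup omitted, parameter values chosen arbitrarily):
+//
+//   #include "viennacl/compressed_matrix.hpp"
+//   #include "viennacl/vector.hpp"
+//   #include "viennacl/linalg/gmres.hpp"
+//
+//   viennacl::compressed_matrix<float> A;                   // assembled elsewhere
+//   viennacl::vector<float>            b;                   // right-hand side
+//   viennacl::linalg::gmres_tag        tag(1e-8, 300, 30);  // tolerance, max. iterations, restart
+//   viennacl::vector<float> x = viennacl::linalg::solve(A, b, tag);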
+
+
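+// The GMRES matrix-vector product kernels below merely forward to the cg_*_prod helper
+// functions used by the pipelined CG solver, shifting p and Ap by the given offsets.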
+template <typename StringType>
+void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_csr_blocked_prod( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  unsigned int offset_p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int offset_Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
+  source.append("} \n \n");
+
+}
+
+template <typename StringType>
+void generate_compressed_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_csr_prod( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("  unsigned int offset_p, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("  unsigned int offset_Ap, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("  unsigned int buffer_size, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
+  source.append("{ \n");
+  source.append("  cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
+  source.append("} \n \n");
+
+}
+
+template <typename StringType>
+void generate_coordinate_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_coo_prod( \n");
+  source.append("          __global const uint2 * coords,  \n");//(row_index, column_index)
+  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("          __global const uint  * group_boundaries, \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * p, \n");
+  source.append("          unsigned int offset_p, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * Ap, \n");
+  source.append("          unsigned int offset_Ap, \n");
+  source.append("          unsigned int size, \n");
+  source.append("          __local unsigned int * shared_rows, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+  source.append("          unsigned int buffer_size, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+  source.append("          __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+  source.append("{ \n");
+  source.append("  cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
+  source.append("} \n \n");
+
+}
+
+
+template <typename StringType>
+void generate_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void gmres_ell_prod( \n");
+  source.append("  __global const unsigned int * coords, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int internal_row_num, \n");
+  source.append("  unsigned int items_per_row, \n");
+  source.append("  unsigned int aligned_items_per_row, \n");
+  source.append("  __global const "); source.app

<TRUNCATED>

[28/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
new file mode 100644
index 0000000..b7eaeb4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
@@ -0,0 +1,3252 @@
+#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/vector_operations.hpp
+    @brief Implementations of vector operations using CUDA
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
+//
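+// The kernels below use the grid-stride loop pattern: each thread starts at its global
+// index (blockDim.x * blockIdx.x + threadIdx.x) and advances by the total number of
+// launched threads (gridDim.x * blockDim.x), so the fixed <<<128, 128>>> launch
+// configuration covers vectors of arbitrary length.
+//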
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_kernel(DestNumericT      * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
+                               SrcNumericT const * src,  unsigned int start_src,  unsigned int inc_src)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < size_dest;
+                    i += gridDim.x * blockDim.x)
+    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
+}
+
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
+                              static_cast<unsigned int>(viennacl::traits::start(dest)),
+                              static_cast<unsigned int>(viennacl::traits::stride(dest)),
+                              static_cast<unsigned int>(viennacl::traits::size(dest)),
+
+                              viennacl::cuda_arg(src),
+                              static_cast<unsigned int>(viennacl::traits::start(src)),
+                              static_cast<unsigned int>(viennacl::traits::stride(src)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
+}
+
+
+//////////////////////// av /////////////////////////////
+
+// gpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          const NumericT * fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
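+  // options2 encoding (assembled by detail::make_options on the host side):
+  //   bit 0 set -> flip the sign of alpha,  bit 1 set -> divide by alpha instead of multiplying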
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+  }
+}
+
+// cpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          NumericT fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+  }
+}
+
+
+
+template<typename NumericT, typename ScalarType1>
+void av(vector_base<NumericT> & vec1,
+        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarType1>::value)
+    temporary_alpha = alpha;
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                          options_alpha,
+                          viennacl::cuda_arg(vec2),
+                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
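+
+// Sketch of the frontend expressions that dispatch to av() for vectors living on a CUDA
+// context (flag values shown for illustration; the actual dispatch happens in the
+// backend-agnostic viennacl::linalg layer):
+//
+//   viennacl::vector<float> v1(n), v2(n);
+//   float alpha = 2.0f;
+//   v1 = alpha * v2;   // roughly av(v1, v2, alpha, 1, /*reciprocal*/ false, /*flip_sign*/ false)
+//   v1 = v2 / alpha;   // roughly av(v1, v2, alpha, 1, /*reciprocal*/ true,  /*flip_sign*/ false)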
+
+
+///////////////////// avbv //////////////////////////////////
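+
+// The avbv kernels compute vec1 = alpha * vec2 + beta * vec3, where each of alpha and beta
+// may independently be negated (bit 0 of its options word) or used as a divisor (bit 1),
+// and may live either on the host (passed by value) or on the device (passed by pointer).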
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                            options_alpha,
+                            viennacl::cuda_arg(vec2),
+                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                            options_beta,
+                            viennacl::cuda_arg(vec3),
+                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
+}
+
+
+////////////////////////// avbv_v //////////////////////////////////////
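+
+// Same structure as the avbv kernels above, but accumulating into vec1:
+//   vec1 += alpha * vec2 + beta * vec3   (with the same per-scalar sign-flip/reciprocal options)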
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                              options_alpha,
+                              viennacl::cuda_arg(vec2),
+                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                              options_beta,
+                              viennacl::cuda_arg(vec3),
+                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
+}
+
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_assign_kernel(NumericT * vec1,
+                                     unsigned int start1,
+                                     unsigned int inc1,
+                                     unsigned int size1,
+                                     unsigned int internal_size1,
+
+                                     NumericT alpha)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < size1;
+                    i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] =  (i < size1) ? alpha : 0;
+}
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1   The vector to which the value should be assigned
+* @param alpha  The value to be assigned
+* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
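+*
+* A minimal usage sketch (assuming this CUDA backend is the one compiled in):
+*   viennacl::vector<float> v(1000);
+*   viennacl::linalg::cuda::vector_assign(v, 1.0f);  // every entry of v becomes 1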
+*/
+template<typename NumericT, typename ScalarT1>
+void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
+{
+  typedef NumericT        value_type;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
+
+  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                     size,
+                                     static_cast<unsigned int>(vec1.internal_size()),  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
+}
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_swap_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2)
+{
+  NumericT tmp;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < size1;
+                    i += gridDim.x * blockDim.x)
+  {
+    tmp = vec2[i*inc2+start2];
+    vec2[i*inc2+start2] = vec1[i*inc1+start1];
+    vec1[i*inc1+start1] = tmp;
+  }
+}
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1   The first vector (or -range, or -slice)
+* @param vec2   The second vector (or -range, or -slice)
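+*
+* A minimal usage sketch:
+*   viennacl::linalg::cuda::vector_swap(v1, v2);  // v1 and v2 exchange their contents element-wise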
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                   static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                   static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                   static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                   viennacl::cuda_arg(vec2),
+                                   static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                   static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
+}
+
+///////////////////////// Binary Elementwise operations /////////////
+
+template<typename NumericT>
+__global__ void element_op_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT const * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2,
+
+                                   NumericT const * vec3,
+                                   unsigned int start3,
+                                   unsigned int inc3,
+
+                                   unsigned int op_type
+                                 )
+{
+  if (op_type == 2)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
+    }
+  }
+  else if (op_type == 1)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+    }
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+    }
+  }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT const * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2,
+
+                                   NumericT const * vec3,
+                                   unsigned int start3,
+                                   unsigned int inc3,
+
+                                   unsigned int op_type
+                                 )
+{
+  if (op_type == 1)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+    }
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+    }
+  }
+}
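+
+// Note: the integer variant above has no op_type == 2 (power) branch; pow() is only
+// applied in the floating-point element_op_kernel.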
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<float> & vec1,
+                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<double> & vec1,
+                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
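For context, these element_op overloads are normally reached through ViennaCL's expression-template layer rather than called directly. The sketch below is a minimal host-side illustration under that assumption; the element_prod/element_div/element_pow entry points and the viennacl/vector.hpp include are taken from the released public API and are not introduced by this commit.

    #include "viennacl/vector.hpp"   // assumed public header providing the element_* helpers

    void binary_elementwise_demo()
    {
      viennacl::vector<float> v1(16);
      viennacl::vector<float> v2 = viennacl::scalar_vector<float>(16, 3.0f);
      viennacl::vector<float> v3 = viennacl::scalar_vector<float>(16, 2.0f);

      v1 = viennacl::linalg::element_prod(v2, v3);  // dispatches with op_type == 0 (multiplication)
      v1 = viennacl::linalg::element_div(v2, v3);   // op_type == 1 (division)
      v1 = viennacl::linalg::element_pow(v2, v3);   // op_type == 2 (power; float/double overloads only)
    }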
+
+///////////////////////// Unary Elementwise operations /////////////
+
+// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
+// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
+
+// acos
+template<typename NumericT>
+__global__ void vec_element_acos_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+  typedef NumericT        value_type;
+
+  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
+}
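All of the vec_element_*_kernel functions in this section share the same grid-stride loop: each thread starts at its global index and advances by the total number of threads launched, so the fixed <<<128, 128>>> launch covers vectors of any length. A self-contained CUDA sketch of just that pattern, with illustrative names only:

    // Grid-stride loop: thread i handles elements i, i + gridDim.x*blockDim.x, i + 2*that, ...
    __global__ void grid_stride_copy(float const * in, float * out, unsigned int n)
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        out[i] = in[i];   // swap the body for acos/asin/ceil/... to recover the kernels in this section
    }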
+
+// asin
+template<typename NumericT>
+__global__ void vec_element_asin_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void vec_element_atan_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void vec_element_ceil_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void vec_element_cos_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void vec_element_cosh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+  vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void vec_element_exp_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+  vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void vec_element_fabs_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+  vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
+}
+
+// abs
+template<typename NumericT>
+__global__ void vec_element_abs_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+  vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                       viennacl::cuda_arg(proxy.lhs()),
+                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                       static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                      );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
+}
+
+
+
+// floor
+template<typename NumericT>
+__global__ void vec_element_floor_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+  vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
+}
+
+
+// log
+template<typename NumericT>
+__global__ void vec_element_log_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+  vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void vec_element_log10_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+  vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void vec_element_sin_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+  vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void vec_element_sinh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+  vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void vec_element_sqrt_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+  vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void vec_element_tan_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
+{
+  vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void vec_element_tanh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
+{
+  vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
+}
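As with the binary case, these unary overloads are reached through the expression layer. A minimal usage sketch, assuming the public element_exp/element_sqrt/element_tanh helpers and the viennacl/vector.hpp include from the released API (assumptions, not part of this diff):

    #include "viennacl/vector.hpp"

    void unary_elementwise_demo()
    {
      viennacl::vector<double> x = viennacl::scalar_vector<double>(32, 0.25);
      viennacl::vector<double> y(32);

      y = viennacl::linalg::element_exp(x);    // CUDA backend: vec_element_exp_kernel
      y = viennacl::linalg::element_sqrt(x);   // vec_element_sqrt_kernel
      y = viennacl::linalg::element_tanh(x);   // vec_element_tanh_kernel
    }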
+
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+template<typename NumericT>
+__global__ void inner_prod_kernel(const NumericT * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+                                  const NumericT * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+                                  unsigned int size2,
+                                  NumericT * group_buffer)
+{
+  __shared__ NumericT tmp_buffer[128];
+  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
+  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
+
+  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
+                               - (  blockIdx.x * size1) / (gridDim.x);
+
+
+  NumericT tmp = 0;
+  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
+    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
+  tmp_buffer[threadIdx.x] = tmp;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+  }
+
+  if (threadIdx.x == 0)
+    group_buffer[blockIdx.x] = tmp_buffer[0];
+
+}
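inner_prod_kernel only produces one partial sum per block; a second, single-block kernel (the vector_sum_kernel_* family below) folds those partials into the final scalar. A hedged sketch of the launch chain, mirroring what inner_prod_impl does further down; the buffer name 'partials' is illustrative:

    // Stage 1: 128 blocks, each writing one partial dot product into 'partials':
    //   viennacl::vector<NumericT> partials(128);
    //   inner_prod_kernel<<<128, 128>>>( /* vec1 pointer/start/stride/size,
    //                                       vec2 pointer/start/stride/size */,
    //                                    viennacl::cuda_arg(partials));
    // Stage 2: a single block of 128 threads folds the 128 partials into the scalar result;
    //          option == 1 selects the plain sum (see vector_sum_kernel_floats just below):
    //   vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(partials), 0, 1, 128,
    //                                        1, viennacl::cuda_arg(result));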
+
+
+
+// Reduces the array 'vec1' (sum, max of absolute values, or sum followed by sqrt, depending on 'option') and writes to 'result'. Makes use of a single work-group only.
+template<typename NumericT>
+__global__ void vector_sum_kernel_floats(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
+    }
+  }
+
+  if (threadIdx.x == 0)
+  {
+    if (option == 2)
+      *result = sqrt(tmp_buffer[0]);
+    else
+      *result = tmp_buffer[0];
+  }
+}
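For orientation, the 'option' flag maps onto the vector norms as follows; this summary is inferred from the kernel body above and from how the norm and inner-product front ends of this backend drive it, and is not text from the diff:

    //   option == 0 : keep the maximum of fabs(vec1[i])            -> infinity norm
    //   option == 1 : plain sum of vec1[i]                         -> norm_1 partials and inner products
    //   option == 2 : sum of vec1[i], then sqrt of the total       -> norm_2 from summed squares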
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_integers(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, //0: use max, 1: just sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+    }
+  }
+
+  if (threadIdx.x == 0)
+    *result = tmp_buffer[0];
+}
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_unsigned_integers(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, //0: use max, 1: just sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+    }
+  }
+
+  if (threadIdx.x == 0)
+    *result = tmp_buffer[0];
+}
+
+namespace detail
+{
+  /** \cond */
+  struct vector_sum_kernel_launcher_integers
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                            static_cast<unsigned int>(option),
+                                            viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+    }
+  };
+
+  struct vector_sum_kernel_launcher_unsigned_integers
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                                      static_cast<unsigned int>(option),
+                                                      viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+    }
+  };
+
+  struct vector_sum_kernel_launcher_floats
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                            static_cast<unsigned int>(option),
+                                            viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+    }
+  };
+
+  template<typename NumericT>
+  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned short>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned int>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};
+
+  template<>
+  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
+
+  /** \endcond */
+}
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+                     vector_base<NumericT> const & vec2,
+                     ScalarT & result)
+{
+  typedef NumericT        value_type;
+
+  static const unsigned int work_groups = 128;
+  static viennacl::vector<value_type> temp(work_groups);
+
+  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(vec2),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                  viennacl::cuda_arg(temp)
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
+}
+
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+template<typename NumericT>
+void inner_prod_cpu(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT> const & vec2,
+                    NumericT & result)
+{
+  typedef NumericT        value_type;
+
+  const unsigned int work_groups = 128;
+  viennacl::vector<value_type> temp(work_groups);
+
+  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(vec2),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                  viennacl::cuda_arg(temp)
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+  // Now copy partial results from GPU back to CPU and run reduction there:
+  std::vector<value_type> temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result += *it;
+}
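Both implementations above are reached through the generic viennacl::linalg::inner_prod() front end; which one runs depends on whether the result lives on the device or on the host. A minimal usage sketch, assuming the public headers viennacl/scalar.hpp, viennacl/vector.hpp and viennacl/linalg/inner_prod.hpp (assumptions from the released API, not introduced here):

    #include "viennacl/scalar.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"

    void inner_prod_demo()
    {
      viennacl::vector<float> v1 = viennacl::scalar_vector<float>(1000, 1.0f);
      viennacl::vector<float> v2 = viennacl::scalar_vector<float>(1000, 2.0f);

      viennacl::scalar<float> s_dev(0.0f);
      s_dev = viennacl::linalg::inner_prod(v1, v2);          // device-side result: inner_prod_impl above

      float s_host = viennacl::linalg::inner_prod(v1, v2);   // host-side result: inner_prod_cpu above
    }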
+
+///////////////////////////////////
+
+#define VIENNACL_MDOT_WORKGROUP_SIZE  128
+#define VIENNACL_MDOT_WORKGROUP_NUM   128
+// M = 2:
+template<typename NumericT>
+__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+  }
+  tmp_buffer[threadIdx.x]              = group_sum0;
+  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
+      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x]             = tmp_buffer[0];
+    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
+  }
+}
+
+// M = 3:
+template<typename NumericT>
+__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+  }
+}
+
+// M = 4:
+template<typename NumericT>
+__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  NumericT group_sum3 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+    group_sum3 += entry_x * y3[i * stride3 + start3];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+  }
+}
+
+// M = 8:
+template<typename NumericT>
+__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
+                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
+                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
+                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  NumericT group_sum3 = 0;
+  NumericT group_sum4 = 0;
+  NumericT group_sum5 = 0;
+  NumericT group_sum6 = 0;
+  NumericT group_sum7 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+    group_sum3 += entry_x * y3[i * stride3 + start3];
+    group_sum4 += entry_x * y4[i * stride4 + start4];
+    group_sum5 += entry_x * y5[i * stride5 + start5];
+    group_sum6 += entry_x * y6[i * stride6 + start6];
+    group_sum7 += entry_x * y7[i * stride7 + start7];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
+  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
+  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
+  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
+      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
+      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
+      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
+    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
+    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
+    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
+  }
+}
+
+// Reduces each contiguous chunk of VIENNACL_MDOT_WORKGROUP_SIZE entries of 'vec1' (the per-block partials of one inner product), using one work-group per chunk, and writes one value per chunk to 'result'.
+template<typename NumericT>
+__global__ void vector_multi_sum_kernel(
+          NumericT const * vec1,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result)
+{
+  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+  }
+
+  if (threadIdx.x == 0)
+    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
+}
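The inner_prod_{2,3,4,8}_kernel family plus vector_multi_sum_kernel exist so that several dot products against a common vector x can be computed in one sweep, loading each x[i] from global memory only once. In the public API this is typically driven through viennacl::linalg::inner_prod() with a vector_tuple; the viennacl::tie() helper in the sketch below is assumed from the released API and does not appear in this diff:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"

    void multi_inner_prod_demo()
    {
      viennacl::vector<float> x(1000), y0(1000), y1(1000), y2(1000);
      viennacl::vector<float> results(3);

      // One pass over x evaluates all three dot products (inner_prod_3_kernel path),
      // then vector_multi_sum_kernel reduces the per-block partials, one work-group per result.
      results = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));
    }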
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+                     vector_tuple<NumericT> const & vec_tuple,
+                     vector_base<NumericT> & result)
+{
+  typedef NumericT        value_type;
+
+  static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
+
+  vcl_size_t current_index = 0;
+  while (vec_tuple.const_size() > current_index)
+  {
+    switch (vec_tuple.const_size() - current_index)
+    {
+      case 7:
+      case 6:
+      case 5:
+      case 4:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+
+        inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                              VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x),
+                                                               static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                               static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                               viennacl::cuda_arg(y0),
+                                                               static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                               viennacl::cuda_arg(y1),
+                                                               static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                               viennacl::cuda_arg(y2),
+                                                               static_cast<unsigned int>(vienna

<TRUNCATED>

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
new file mode 100644
index 0000000..959bbd8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
@@ -0,0 +1,108 @@
+#ifndef VIENNACL_LINALG_NORM_INF_HPP_
+#define VIENNACL_LINALG_NORM_INF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_inf.hpp
+    @brief Generic interface for the l^infty-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic norm_inf function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type
+                                >::type
+    norm_inf(VectorT const& v1)
+    {
+      return boost::numeric::ublas::norm_inf(v1);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // STL
+    //
+    template< typename T, typename A >
+    T norm_inf(std::vector<T, A> const & v1)
+    {
+      //std::cout << "stl .. " << std::endl;
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+      {
+        if (std::fabs(v1[i]) > result)
+          result = std::fabs(v1[i]);
+      }
+
+      return result;
+    }
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_norm_inf >
+    norm_inf(viennacl::vector_base<ScalarType> const & v1)
+    {
+       //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_norm_inf >(v1, v1);
+    }
+
+    // with vector expression:
+    template<typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_inf>
+    norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_inf >(vector, vector);
+    }
+
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
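
A minimal usage sketch for the generic norm_inf() interface added above (the main() harness, sample values, and variable names are illustrative and not part of the patch; it assumes the ViennaCL headers are on the include path):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/norm_inf.hpp"

    int main()
    {
      // STL overload: plain loop over std::fabs, returns the maximum immediately.
      std::vector<double> v_host(3);
      v_host[0] = 1.0; v_host[1] = -4.0; v_host[2] = 2.5;
      double h = viennacl::linalg::norm_inf(v_host);   // 4.0

      // ViennaCL overload: returns a scalar_expression that is evaluated
      // on the device when assigned to a host scalar.
      viennacl::vector<double> v_dev(3);
      viennacl::copy(v_host, v_dev);
      double d = viennacl::linalg::norm_inf(v_dev);    // also 4.0

      return (h == d) ? 0 : 1;
    }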

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
new file mode 100644
index 0000000..7cdcf89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
@@ -0,0 +1,458 @@
+#ifndef VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file opencl/amg_operations.hpp
+    @brief Implementations of routines for AMG in OpenCL.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include <map>
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/amg.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                           viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+  viennacl::ocl::kernel & influence_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_influence_trivial");
+
+  viennacl::ocl::enqueue(influence_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(),
+                                          cl_uint(A.size1()),
+                                          cl_uint(A.nnz()),
+                                          viennacl::traits::opencl_handle(amg_context.influence_jumper_),
+                                          viennacl::traits::opencl_handle(amg_context.influence_ids_),
+                                          viennacl::traits::opencl_handle(amg_context.influence_values_)
+                                         )
+                         );
+}
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                            viennacl::linalg::amg_tag & tag)
+{
+  (void)A; (void)amg_context; (void)tag;
+  throw std::runtime_error("amg_influence_advanced() not implemented for OpenCL yet");
+}
+
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+  // TODO: dispatch based on influence tolerance provided
+  amg_influence_trivial(A, amg_context, tag);
+}
+
+
+
+/** @brief Assign IDs to coarse points.
+*
+*  TODO: Use exclusive_scan on GPU for this.
+*/
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
+  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
+  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
+  viennacl::backend::memory_read(amg_context.coarse_id_.handle(),   0, coarse_ids.raw_size(),  coarse_ids.get());
+
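+  // Exclusive scan over the coarse flags: each point receives the number of coarse points
+  // preceding it, so coarse points end up with consecutive IDs starting at 0.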
+  unsigned int coarse_id = 0;
+  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
+  {
+    coarse_ids.set(i, coarse_id);
+    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+      ++coarse_id;
+  }
+
+  amg_context.num_coarse_ = coarse_id;
+
+  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+}
+
+
+//////////////////////////////////////
+
+
+
+/** @brief AG (aggregation based) coarsening, stage 1: parallel MIS-2 computation (OpenCL)
+*
+* @param A             Operator matrix on all levels
+* @param amg_context   AMG hierarchy datastructures
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                               viennacl::linalg::amg_tag & tag)
+{
+  (void)tag;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
+  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
+  for (std::size_t i=0; i<random_weights.size(); ++i)
+    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+  random_weights.switch_memory_context(viennacl::traits::context(A));
+
+  // work vectors:
+  viennacl::vector<unsigned int> work_state(A.size1(),  viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index(A.size1(),  viennacl::traits::context(A));
+
+  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));
+
+  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
+  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());
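+  // undecided_buffer collects partial counts of still-undecided points written by the kernel;
+  // it is read back and summed on the host after each sweep.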
+
+  viennacl::ocl::kernel & init_workdata_kernel    = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_init_workdata");
+  viennacl::ocl::kernel & max_neighborhood_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_max_neighborhood");
+  viennacl::ocl::kernel & mark_mis_nodes_kernel   = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_mark_mis_nodes");
+  viennacl::ocl::kernel & reset_state_kernel      = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_reset_state");
+
+  unsigned int pmis_iters = 0;
+  while (num_undecided > 0)
+  {
+    ++pmis_iters;
+
+    //
+    // init temporary work data:
+    //
+    viennacl::ocl::enqueue(init_workdata_kernel(work_state,  work_random,  work_index,
+                                                amg_context.point_types_,
+                                                random_weights,
+                                                cl_uint(A.size1())
+                                               )
+                          );
+
+    //
+    // Propagate maximum tuple twice
+    //
+    for (unsigned int r = 0; r < 2; ++r)
+    {
+      // max operation
+      viennacl::ocl::enqueue(max_neighborhood_kernel(work_state,  work_random,  work_index,
+                                                     work_state2, work_random2, work_index2,
+                                                     amg_context.influence_jumper_, amg_context.influence_ids_,
+                                                     cl_uint(A.size1())
+                                                    )
+                            );
+
+      // copy work arrays (could be fused into the preceding kernel if needed; that kernel is in most cases sufficiently heavy)
+      work_state  = work_state2;
+      work_random = work_random2;
+      work_index  = work_index2;
+    }
+
+    //
+    // mark MIS and non-MIS nodes:
+    //
+    viennacl::ocl::enqueue(mark_mis_nodes_kernel(work_state, work_index,
+                                                 amg_context.point_types_,
+                                                 undecided_buffer,
+                                                 cl_uint(A.size1())
+                                                )
+                          );
+
+    // get number of undecided points on host:
+    viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
+    num_undecided = 0;
+    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
+      num_undecided += undecided_buffer_host[i];
+
+  } //while
+
+  viennacl::ocl::enqueue(reset_state_kernel(amg_context.point_types_, cl_uint(amg_context.point_types_.size()) ) );
+}
+
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A             Operator matrix
+* @param amg_context   AMG hierarchy datastructures
+* @param tag           AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                   viennacl::linalg::amg_tag & tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+  amg_influence_trivial(A, amg_context, tag);
+
+  //
+  // Stage 1: Build aggregates:
+  //
+  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
+    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+  else
+    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with OpenCL backend!");
+
+  viennacl::linalg::opencl::amg::enumerate_coarse_points(amg_context);
+
+  //
+  // Stage 2: Propagate coarse aggregate indices to neighbors:
+  //
+  viennacl::ocl::kernel & propagate_coarse_indices = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_propagate_coarse_indices");
+  viennacl::ocl::enqueue(propagate_coarse_indices(amg_context.point_types_,
+                                                  amg_context.coarse_id_,
+                                                  amg_context.influence_jumper_,
+                                                  amg_context.influence_ids_,
+                                                  cl_uint(A.size1())
+                                                 )
+                        );
+
+  //
+  // Stage 3: Merge remaining undecided points (merging into the first aggregate found when cycling over the hierarchy)
+  //
+  viennacl::ocl::kernel & merge_undecided = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided");
+  viennacl::ocl::enqueue(merge_undecided(amg_context.point_types_,
+                                         amg_context.coarse_id_,
+                                         amg_context.influence_jumper_,
+                                         amg_context.influence_ids_,
+                                         cl_uint(A.size1())
+                                        )
+                         );
+
+  //
+  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+  //          Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+  //
+  viennacl::ocl::kernel & merge_undecided_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided_2");
+  viennacl::ocl::enqueue(merge_undecided_2(amg_context.point_types_, cl_uint(A.size1()) ) );
+
+}
+
+
+
+
+/** @brief Calls the right coarsening procedure
+*
+* @param A            Operator matrix on all levels
+* @param amg_context  AMG hierarchy datastructures
+* @param tag          AMG preconditioner tag
+*/
+template<typename InternalT1>
+void amg_coarse(InternalT1 & A,
+                viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_coarsening_method())
+  {
+  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+  default: throw std::runtime_error("not implemented yet");
+  }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+
+/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_AG)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+  (void)tag;
+  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  // build matrix here
+  viennacl::ocl::kernel & interpolate_ag = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_ag");
+  viennacl::ocl::enqueue(interpolate_ag(P.handle1().opencl_handle(),
+                                        P.handle2().opencl_handle(),
+                                        P.handle().opencl_handle(),
+                                        amg_context.coarse_id_,
+                                        cl_uint(A.size1())
+                                        )
+                         );
+
+  P.generate_row_block_information();
+}
+
+/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+                     compressed_matrix<NumericT> & P,
+                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                     viennacl::linalg::amg_tag & tag)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+  (void)tag;
+  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+  // form tentative operator:
+  amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+  viennacl::ocl::kernel & interpol_sa = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_sa");
+  viennacl::ocl::enqueue(interpol_sa(A.handle1().opencl_handle(),
+                                     A.handle2().opencl_handle(),
+                                     A.handle().opencl_handle(),
+                                     cl_uint(A.size1()),
+                                     cl_uint(A.nnz()),
+                                     Jacobi.handle1().opencl_handle(),
+                                     Jacobi.handle2().opencl_handle(),
+                                     Jacobi.handle().opencl_handle(),
+                                     NumericT(tag.get_jacobi_weight())
+                                    )
+                         );
+
+  P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+  P.generate_row_block_information();
+}
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A            Operator matrix
+ * @param P            Prolongation matrix
+ * @param amg_context  AMG hierarchy datastructures
+ * @param tag          AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+                  MatrixT & P,
+                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
+                  viennacl::linalg::amg_tag & tag)
+{
+  switch (tag.get_interpolation_method())
+  {
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:           amg_interpol_ag     (A, P, amg_context, tag); break;
+  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION:  amg_interpol_sa     (A, P, amg_context, tag); break;
+  default: throw std::runtime_error("Not implemented yet!");
+  }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                     viennacl::matrix_base<NumericT> & B)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+                                             "assign_to_dense");
+
+  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(B),
+                           cl_uint(viennacl::traits::start1(B)),         cl_uint(viennacl::traits::start2(B)),
+                           cl_uint(viennacl::traits::stride1(B)),        cl_uint(viennacl::traits::stride2(B)),
+                           cl_uint(viennacl::traits::size1(B)),          cl_uint(viennacl::traits::size2(B)),
+                           cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)) ));
+
+}
+
+/** @brief Jacobi Smoother (OpenCL version)
+*
+* @param iterations  Number of smoother iterations
+* @param A           Operator matrix for the smoothing
+* @param x           The vector smoothing is applied to
+* @param x_backup    (Different) Vector holding the same values as x
+* @param rhs_smooth  The right hand side of the equation for the smoother
+* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+                   compressed_matrix<NumericT> const & A,
+                   vector<NumericT> & x,
+                   vector<NumericT> & x_backup,
+                   vector<NumericT> const & rhs_smooth,
+                   NumericT weight)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+  viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "jacobi");
+
+  for (unsigned int i=0; i<iterations; ++i)
+  {
+    x_backup = x;
+
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                            static_cast<NumericT>(weight),
+                            viennacl::traits::opencl_handle(x_backup),
+                            viennacl::traits::opencl_handle(x),
+                            viennacl::traits::opencl_handle(rhs_smooth),
+                            static_cast<cl_uint>(rhs_smooth.size())));
+
+  }
+}
+
+
+} //namespace amg
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif
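
For reference, a host-side sketch of the per-row update that the "jacobi" OpenCL kernel used by smooth_jacobi() performs (the plain-CSR layout, function name, and parameter names below are mine; the actual computation runs on the device):

    #include <vector>
    #include <cstddef>

    // One damped Jacobi sweep on a CSR matrix:
    //   x_new[i] = (1 - w) * x_old[i] + w * (rhs[i] - sum_{j != i} A(i,j) * x_old[j]) / A(i,i)
    void jacobi_sweep(std::vector<std::size_t> const & row_ptr,
                      std::vector<std::size_t> const & col_idx,
                      std::vector<double>      const & values,
                      std::vector<double>      const & x_old,
                      std::vector<double>            & x_new,
                      std::vector<double>      const & rhs,
                      double w)
    {
      for (std::size_t i = 0; i + 1 < row_ptr.size(); ++i)
      {
        double diag = 1.0;
        double off_diag_sum = 0.0;
        for (std::size_t k = row_ptr[i]; k < row_ptr[i + 1]; ++k)
        {
          if (col_idx[k] == i)
            diag = values[k];                               // diagonal entry A(i,i)
          else
            off_diag_sum += values[k] * x_old[col_idx[k]];  // off-diagonal contribution
        }
        x_new[i] = w * (rhs[i] - off_diag_sum) / diag + (1.0 - w) * x_old[i];
      }
    }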

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..2fcd6fa
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
@@ -0,0 +1,177 @@
+#ifndef VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/bisect_kernel_calls.hpp
+    @brief OpenCL kernel calls for the bisection algorithm
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/opencl/kernels/bisect.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+const std::string BISECT_KERNEL_SMALL = "bisectKernelSmall";
+const std::string BISECT_KERNEL_LARGE = "bisectKernelLarge";
+const std::string BISECT_KERNEL_LARGE_ONE_INTERVALS  = "bisectKernelLarge_OneIntervals";
+const std::string BISECT_KERNEL_LARGE_MULT_INTERVALS = "bisectKernelLarge_MultIntervals";
+
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input,
+                         viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+                         const unsigned int mat_size,
+                         const NumericT lg, const NumericT ug,
+                         const NumericT precision)
+    {
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+      viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_SMALL);
+      kernel.global_work_size(0, 1 * VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+      kernel.local_work_size(0, VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+
+      viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+                                    viennacl::traits::opencl_handle(input.g_b),
+                                    static_cast<cl_uint>(mat_size),
+                                    viennacl::traits::opencl_handle(result.vcl_g_left),
+                                    viennacl::traits::opencl_handle(result.vcl_g_right),
+                                    viennacl::traits::opencl_handle(result.vcl_g_left_count),
+                                    viennacl::traits::opencl_handle(result.vcl_g_right_count),
+                                    static_cast<NumericT>(lg),
+                                    static_cast<NumericT>(ug),
+                                    static_cast<cl_uint>(0),
+                                    static_cast<cl_uint>(mat_size),
+                                    static_cast<NumericT>(precision)
+                            ));
+
+    }
+
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input,
+                 viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                 const unsigned int mat_size,
+                 const NumericT lg, const NumericT ug,
+                 const NumericT precision)
+    {
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+      viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE);
+      kernel.global_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);     // Use only half a block (128 threads) for 256 < n <= 512;
+      kernel.local_work_size(0,  mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);     // this is sufficient for matrices of that size.
+
+      viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+                                    viennacl::traits::opencl_handle(input.g_b),
+                                    static_cast<cl_uint>(mat_size),
+                                    static_cast<NumericT>(lg),
+                                    static_cast<NumericT>(ug),
+                                    static_cast<cl_uint>(0),
+                                    static_cast<cl_uint>(mat_size),
+                                    static_cast<NumericT>(precision),
+                                    viennacl::traits::opencl_handle(result.g_num_one),
+                                    viennacl::traits::opencl_handle(result.g_num_blocks_mult),
+                                    viennacl::traits::opencl_handle(result.g_left_one),
+                                    viennacl::traits::opencl_handle(result.g_right_one),
+                                    viennacl::traits::opencl_handle(result.g_pos_one),
+                                    viennacl::traits::opencl_handle(result.g_left_mult),
+                                    viennacl::traits::opencl_handle(result.g_right_mult),
+                                    viennacl::traits::opencl_handle(result.g_left_count_mult),
+                                    viennacl::traits::opencl_handle(result.g_right_count_mult),
+                                    viennacl::traits::opencl_handle(result.g_blocks_mult),
+                                    viennacl::traits::opencl_handle(result.g_blocks_mult_sum)
+                            ));
+
+    }
+
+template<typename NumericT>
+void bisectLargeOneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+                             viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                             const unsigned int mat_size,
+                             const NumericT precision)
+    {
+      unsigned int num_one_intervals = result.g_num_one;
+      unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+                                                                             mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK: VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+      viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_ONE_INTERVALS);
+      kernel.global_work_size(0, num_blocks * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+      kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+      viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+                                    viennacl::traits::opencl_handle(input.g_b),
+                                    static_cast<cl_uint>(mat_size),
+                                    static_cast<cl_uint>(num_one_intervals),
+                                    viennacl::traits::opencl_handle(result.g_left_one),
+                                    viennacl::traits::opencl_handle(result.g_right_one),
+                                    viennacl::traits::opencl_handle(result.g_pos_one),
+                                    static_cast<NumericT>(precision)
+                            ));
+    }
+
+
+template<typename NumericT>
+void bisectLargeMultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+                              viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                              const unsigned int mat_size,
+                              const NumericT precision)
+    {
+      unsigned int  num_blocks_mult = result.g_num_blocks_mult;
+
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+      viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_MULT_INTERVALS);
+      kernel.global_work_size(0, num_blocks_mult * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+      kernel.local_work_size(0,                     mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+      viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+                                    viennacl::traits::opencl_handle(input.g_b),
+                                    static_cast<cl_uint>(mat_size),
+                                    viennacl::traits::opencl_handle(result.g_blocks_mult),
+                                    viennacl::traits::opencl_handle(result.g_blocks_mult_sum),
+                                    viennacl::traits::opencl_handle(result.g_left_mult),
+                                    viennacl::traits::opencl_handle(result.g_right_mult),
+                                    viennacl::traits::opencl_handle(result.g_left_count_mult),
+                                    viennacl::traits::opencl_handle(result.g_right_count_mult),
+                                    viennacl::traits::opencl_handle(result.g_lambda_mult),
+                                    viennacl::traits::opencl_handle(result.g_pos_mult),
+                                    static_cast<NumericT>(precision)
+                            ));
+    }
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+
+#endif
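
The four launchers above all pick their work sizes the same way; pulled out here as a small sketch for clarity (the helper name is mine, and the constant is presumably supplied by viennacl/linalg/detail/bisect/config.hpp, which this header already includes):

    #include <cstddef>
    #include "viennacl/linalg/detail/bisect/config.hpp"

    // Full block of threads for matrices with more than 512 rows,
    // half a block otherwise (cf. the inline comments in bisectLarge()).
    inline std::size_t bisect_block_size(unsigned int mat_size)
    {
      return mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK
                            : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2;
    }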

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
new file mode 100644
index 0000000..d6a288b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
@@ -0,0 +1,102 @@
+#ifndef VIENNACL_LINALG_OPENCL_COMMON_HPP_
+#define VIENNACL_LINALG_OPENCL_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/common.hpp
+    @brief Common implementations shared by OpenCL-based operations
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
+
+
+inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+{
+  return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
+
+
+/** @brief Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices. */
+inline std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
+{
+  if (B_transposed)
+  {
+    if (B_row_major && C_row_major)
+      return "trans_mat_mult_row_row";
+    if (B_row_major && !C_row_major)
+      return "trans_mat_mult_row_col";
+    if (!B_row_major && C_row_major)
+      return "trans_mat_mult_col_row";
+
+    return "trans_mat_mult_col_col";
+  }
+
+  if (B_row_major && C_row_major)
+    return "mat_mult_row_row";
+  if (B_row_major && !C_row_major)
+    return "mat_mult_row_col";
+  if (!B_row_major && C_row_major)
+    return "mat_mult_col_row";
+
+  return "mat_mult_col_col";
+}
+
+
+
+template<typename SomeT>
+ocl::device const & current_device(SomeT const & obj) {  return traits::opencl_handle(obj).context().current_device(); }
+
+inline std::string op_to_string(op_abs)   { return "abs";   }
+inline std::string op_to_string(op_acos)  { return "acos";  }
+inline std::string op_to_string(op_asin)  { return "asin";  }
+inline std::string op_to_string(op_atan)  { return "atan";  }
+inline std::string op_to_string(op_ceil)  { return "ceil";  }
+inline std::string op_to_string(op_cos)   { return "cos";   }
+inline std::string op_to_string(op_cosh)  { return "cosh";  }
+inline std::string op_to_string(op_exp)   { return "exp";   }
+inline std::string op_to_string(op_fabs)  { return "fabs";  }
+inline std::string op_to_string(op_floor) { return "floor"; }
+inline std::string op_to_string(op_log)   { return "log";   }
+inline std::string op_to_string(op_log10) { return "log10"; }
+inline std::string op_to_string(op_sin)   { return "sin";   }
+inline std::string op_to_string(op_sinh)  { return "sinh";  }
+inline std::string op_to_string(op_sqrt)  { return "sqrt";  }
+inline std::string op_to_string(op_tan)   { return "tan";   }
+inline std::string op_to_string(op_tanh)  { return "tanh";  }
+
+} //namespace detail
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
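
The bit layout produced by make_options() above, spelled out as a small decoding sketch (the packed_options struct and decode_options() helper are mine, for illustration only; cl_uint is assumed to be available via the OpenCL headers used elsewhere in this backend):

    // bit 0      -> flip_sign
    // bit 1      -> reciprocal
    // bits 2..31 -> length (only encoded when length > 1)
    struct packed_options
    {
      unsigned int length;
      bool reciprocal;
      bool flip_sign;
    };

    inline packed_options decode_options(cl_uint opts)
    {
      packed_options r;
      r.flip_sign  = (opts & 1u) != 0;
      r.reciprocal = (opts & 2u) != 0;
      r.length     = opts >> 2;
      return r;
    }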

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
new file mode 100644
index 0000000..76874b1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
@@ -0,0 +1,153 @@
+#ifndef VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/direct_solve.hpp
+    @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_solve.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+  inline cl_uint get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
+  inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+  inline cl_uint get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
+  inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
+
+  template<typename MatrixT1, typename MatrixT2, typename KernelT>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, KernelT & k)
+  {
+    viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                             cl_uint(viennacl::traits::start1(A)),         cl_uint(viennacl::traits::start2(A)),
+                             cl_uint(viennacl::traits::stride1(A)),        cl_uint(viennacl::traits::stride2(A)),
+                             cl_uint(viennacl::traits::size1(A)),          cl_uint(viennacl::traits::size2(A)),
+                             cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+                             viennacl::traits::opencl_handle(B),
+                             cl_uint(viennacl::traits::start1(B)),         cl_uint(viennacl::traits::start2(B)),
+                             cl_uint(viennacl::traits::stride1(B)),        cl_uint(viennacl::traits::stride2(B)),
+                             cl_uint(viennacl::traits::size1(B)),          cl_uint(viennacl::traits::size2(B)),
+                             cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B))
+                            )
+                          );
+  }
+}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// triangular solvers //////////////////////////////////////
+/** @brief Direct inplace solver for dense triangular systems. Matlab notation: A \ B
+*
+* @param A    The system matrix
+* @param B    The matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+  std::string program_name;
+  if (A.row_major() && B.row_major())
+  {
+    typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, row_major>    KernelClass;
+    KernelClass::init(ctx);
+    program_name = KernelClass::program_name();
+  }
+  else if (A.row_major() && !B.row_major())
+  {
+    typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, column_major>    KernelClass;
+    KernelClass::init(ctx);
+    program_name = KernelClass::program_name();
+  }
+  else if (!A.row_major() && B.row_major())
+  {
+    typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, row_major>    KernelClass;
+    KernelClass::init(ctx);
+    program_name = KernelClass::program_name();
+  }
+  else
+  {
+    typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, column_major>    KernelClass;
+    KernelClass::init(ctx);
+    program_name = KernelClass::program_name();
+  }
+
+  std::stringstream ss;
+  ss << SolverTagT::name();
+  ss << "_solve";
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(program_name, ss.str());
+
+  k.global_work_size(0, B.size2() * k.local_work_size());
+  detail::inplace_solve_impl(A, B, k);
+}
+
+
+
+//
+//  Solve on vector
+//
+
+template<typename NumericT, typename SOLVERTAG>
+void inplace_solve(matrix_base<NumericT> const & A,
+                   vector_base<NumericT>       & x,
+                   SOLVERTAG)
+{
+  cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG());
+
+  viennacl::ocl::kernel & k = detail::legacy_kernel_for_matrix(A,  "triangular_substitute_inplace");
+
+  k.global_work_size(0, k.local_work_size());
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                           cl_uint(viennacl::traits::start1(A)),         cl_uint(viennacl::traits::start2(A)),
+                           cl_uint(viennacl::traits::stride1(A)),        cl_uint(viennacl::traits::stride2(A)),
+                           cl_uint(viennacl::traits::size1(A)),          cl_uint(viennacl::traits::size2(A)),
+                           cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+                           viennacl::traits::opencl_handle(x),
+                           cl_uint(viennacl::traits::start(x)),
+                           cl_uint(viennacl::traits::stride(x)),
+                           cl_uint(viennacl::traits::size(x)),
+                           options
+                          )
+                        );
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif
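
A minimal usage sketch for the in-place triangular solve implemented above, going through the public frontend (matrix contents and the function name are illustrative; this assumes the usual viennacl/linalg/direct_solve.hpp frontend dispatches to this OpenCL backend when OpenCL is enabled):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    // Solve A * X = B in place for an upper-triangular A; B is overwritten with X.
    void solve_upper(viennacl::matrix<double> const & A,
                     viennacl::matrix<double>       & B)
    {
      viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());
    }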

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
new file mode 100644
index 0000000..a7b12b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
@@ -0,0 +1,350 @@
+#ifndef VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+   Institute for Analysis and Scientific Computing,
+   TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/opencl/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform using OpenCL
+ */
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include <cmath>
+#include <stdexcept>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace fft
+{
+
+  const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+  /**
+   * @brief Get the number of bits b such that (1 << b) >= size
+   */
+  inline vcl_size_t num_bits(vcl_size_t size)
+  {
+    vcl_size_t bits_datasize = 0;
+    vcl_size_t ds = 1;
+
+    while (ds < size)
+    {
+      ds = ds << 1;
+      bits_datasize++;
+    }
+
+    return bits_datasize;
+  }
+
+  /**
+   * @brief Find next power of two
+   */
+  inline vcl_size_t next_power_2(vcl_size_t n)
+  {
+    n = n - 1;
+
+    vcl_size_t power = 1;
+
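+    // Smear the highest set bit of (n - 1) into all lower bit positions;
+    // the final increment then yields the next power of two.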
+    while (power < sizeof(vcl_size_t) * 8)
+    {
+      n = n | (n >> power);
+      power *= 2;
+    }
+
+    return n + 1;
+  }
+
+} //namespace fft
+} //namespace detail
+
+namespace opencl
+{
+
+/**
+ * @brief Direct algorithm for computing the Fourier transform.
+ *
+ * Works for any data size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT>
+void direct(viennacl::ocl::handle<cl_mem> const & in,
+            viennacl::ocl::handle<cl_mem> const & out,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+  if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+  {
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+    program_string =
+        viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+  } else
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_direct");
+  viennacl::ocl::enqueue(k(in, out,
+                           static_cast<cl_uint>(size),
+                           static_cast<cl_uint>(stride),
+                           static_cast<cl_uint>(batch_num),
+                           sign)
+                        );
+}
+
+/*
+ * This function reorders the input data: indices are permuted into bit-reversal order.
+ * Such a reordering has to be done before an in-place radix-2 FFT.
+ */
+template<typename NumericT>
+void reorder(viennacl::ocl::handle<cl_mem> const & in,
+             vcl_size_t size, vcl_size_t stride,
+             vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+  if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+  {
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+    program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+  } else
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(program_string, "fft_reorder");
+  viennacl::ocl::enqueue(k(in,
+                           static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+                           static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num))
+                        );
+}
+
+/**
+ * @brief Radix-2 algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT>
+void radix2(viennacl::ocl::handle<cl_mem> const & in,
+            vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  assert(batch_num != 0 && bool("batch_num must be larger than 0"));
+
+  std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+  if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+  {
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+    program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+  } else
+    viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+  vcl_size_t bits_datasize = viennacl::linalg::detail::fft::num_bits(size);
+  if (size <= viennacl::linalg::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2_local");
+    viennacl::ocl::enqueue(k(in,
+                             viennacl::ocl::local_mem((size * 4) * sizeof(NumericT)),
+                             static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+                             static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num), sign));
+
+  }
+  else
+  {
+    viennacl::linalg::opencl::reorder<NumericT>(in, size, stride, bits_datasize, batch_num);
+
+    for (vcl_size_t step = 0; step < bits_datasize; step++)
+    {
+      viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2");
+      viennacl::ocl::enqueue(k(in,
+                               static_cast<cl_uint>(step), static_cast<cl_uint>(bits_datasize),
+                               static_cast<cl_uint>(size), static_cast<cl_uint>(stride),
+                               static_cast<cl_uint>(batch_num), sign));
+    }
+  }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses a considerable amount of additional memory, but should be fast for any input size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in,
+               viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  vcl_size_t size = in.size() >> 1;
+  vcl_size_t ext_size = viennacl::linalg::detail::fft::next_power_2(2 * size - 1);
+
+  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+  {
+    viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "zero2");
+    viennacl::ocl::enqueue(k(A, B, static_cast<cl_uint>(ext_size)));
+  }
+  {
+    viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_pre");
+    viennacl::ocl::enqueue(k(in, A, B, static_cast<cl_uint>(size), static_cast<cl_uint>(ext_size)));
+  }
+
+  viennacl::linalg::convolve_i(A, B, Z);
+
+  {
+    viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_post");
+    viennacl::ocl::enqueue(k(Z, out, static_cast<cl_uint>(size)));
+  }
+}
+
+/**
+ * @brief Multiply two complex vectors and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV>       & output)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input1).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+  vcl_size_t size = input1.size() >> 1;
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_mult_vec");
+  viennacl::ocl::enqueue(k(input1, input2, output, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Normalize a vector by dividing it by its own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_div_vec_scalar");
+
+  vcl_size_t size = input.size() >> 1;
+  NumericT norm_factor = static_cast<NumericT>(size);
+  viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(size), norm_factor));
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose_inplace");
+  viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(input.internal_size1() >> 1),
+                           static_cast<cl_uint>(input.internal_size2()) >> 1));
+}
+
+/**
+ * @brief Transpose a matrix into a separate output matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose");
+  viennacl::ocl::enqueue(k(input, output, static_cast<cl_uint>(input.internal_size1() >> 1),
+                           static_cast<cl_uint>(input.internal_size2() >> 1)));
+}
+
+/**
+ * @brief Create a complex vector from a real vector (even elements (2*k) hold the real parts, odd elements (2*k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "real_to_complex");
+  viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Create a real vector from a complex vector (even elements (2*k) hold the real parts, odd elements (2*k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "complex_to_real");
+  viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
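+// Layout sketch (illustrative, hypothetical size 'n'): the complex buffers are interleaved,
+// so a round trip through these two helpers looks like
+//
+//   viennacl::vector<float> re(n), cpx(2 * n);
+//   real_to_complex(re, cpx, n);   // re[k] -> cpx[2*k] (imaginary slots cpx[2*k+1] become zero)
+//   complex_to_real(cpx, re, n);   // cpx[2*k] -> re[k] (imaginary slots are dropped)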
+
+/**
+ * @brief Reverse the order of the vector's elements in place (the result overwrites the input vector)
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+  viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+  vcl_size_t size = in.size();
+
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "reverse_inplace");
+  viennacl::ocl::enqueue(k(in, static_cast<cl_uint>(size)));
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
new file mode 100644
index 0000000..248a88a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
@@ -0,0 +1,260 @@
+#ifndef VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using OpenCL
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/////////////////////// ICC /////////////////////
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  //
+  // Step 1: Count elements in L:
+  //
+  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_1");
+
+  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+                            L.handle1().opencl_handle())
+                        );
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+
+  //
+  // Step 3: Write entries
+  //
+  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_2");
+
+  viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+                            L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle())
+                        );
+
+  L.generate_row_block_information();
+
+} // extract_L
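+// The count / exclusive-scan / write pattern above is the standard way to build a CSR
+// row-pointer array. A minimal host-side illustration of the scan step with made-up
+// per-row counts (not library code):
+//
+//   unsigned int counts[5] = {2, 0, 3, 1, 0};   // nonzeros counted per row (step 1)
+//   unsigned int row_ptr[6];                    // CSR row pointers, size rows + 1
+//   unsigned int sum = 0;
+//   for (int i = 0; i < 5; ++i) { row_ptr[i] = sum; sum += counts[i]; }
+//   row_ptr[5] = sum;                           // -> {0, 2, 2, 5, 6, 6}, total nnz = 6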
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values from A stored in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  // fill D:
+  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+  // scale L:
+  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+  viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel incomplete Cholesky factorization using OpenCL (cf. Algorithm 2 in the Chow-Patel paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "icc_chow_patel_sweep_kernel");
+  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+                           aij_L)
+                        );
+
+}
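+// One plausible way these building blocks compose into a preconditioner setup
+// (a hedged sketch, not the library's own implementation; 'nsweeps' and the
+// initialization of aij_L are simplified assumptions):
+//
+//   viennacl::compressed_matrix<float> L(A.size1(), A.size2(), viennacl::traits::context(A));
+//   extract_L(A, L);                        // structural lower-triangular part of A
+//   icc_scale(A, L);                        // scale so that diag(D A D) == 1
+//   viennacl::vector<float> aij_L(L.nnz(), viennacl::traits::context(A));
+//   // ... fill aij_L with the scaled values of A on L's sparsity pattern ...
+//   for (unsigned int i = 0; i < nsweeps; ++i)
+//     icc_chow_patel_sweep(L, aij_L);       // nonlinear fixed-point sweeps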
+
+
+/////////////////////// ILU /////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  //
+  // Step 1: Count elements in L and U:
+  //
+  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_1");
+
+  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+                            L.handle1().opencl_handle(),
+                            U.handle1().opencl_handle())
+                        );
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), A.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+  U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+  //
+  // Step 3: Write entries
+  //
+  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_2");
+
+  viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+                            L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+                            U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle())
+                        );
+
+  L.generate_row_block_information();
+  // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  // fill D:
+  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+  // scale L:
+  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+  viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+  // scale U:
+  viennacl::ocl::enqueue(k2(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  viennacl::backend::mem_handle U_backup;
+  viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+  viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_chow_patel_sweep_kernel");
+  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+                           aij_L,
+                           U_trans.handle1().opencl_handle(), U_trans.handle2().opencl_handle(), U_trans.handle().opencl_handle(), U_backup.opencl_handle(),
+                           aij_U_trans)
+                        );
+
+}
+
+//////////////////////////////////////
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(R).context());
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_form_neumann_matrix_kernel");
+  viennacl::ocl::enqueue(k(R.handle1().opencl_handle(), R.handle2().opencl_handle(), R.handle().opencl_handle(), cl_uint(R.size1()),
+                           diag_R)
+                        );
+}
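+// Intent (hedged interpretation based on the kernel name): writing a triangular factor
+// as T = D (I - R) with D = diag(T) and R strictly triangular, the solve T x = b can be
+// approximated by a truncated Neumann series,
+//
+//   x ~= (I + R + R^2 + ... + R^k) D^{-1} b,
+//
+// which replaces the inherently sequential forward/backward substitution by a few
+// sparse matrix-vector products. This routine presumably extracts diag_R and rewrites
+// R accordingly.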
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
new file mode 100644
index 0000000..20c4994
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
@@ -0,0 +1,219 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  v2 *= beta;
+  if (transA == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+  return ViennaCLSuccess;
+}
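+// Both entry points implement the usual BLAS-2 update y <- beta*y + alpha*op(A)*x.
+// For illustration, the equivalent operation in ViennaCL's C++ API (hypothetical
+// sizes m, n and scalars alpha, beta):
+//
+//   viennacl::matrix<float> A(m, n);
+//   viennacl::vector<float> x(n), y(m);
+//   y *= beta;
+//   y += alpha * viennacl::linalg::prod(A, x);                      // op(A) = A
+//   // y += alpha * viennacl::linalg::prod(viennacl::trans(A), x);  // op(A) = A^T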
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+  if (transA == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      if (diag == ViennaCLUnit)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                   size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                   size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             double alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                    size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+                                    size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+  mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+  return ViennaCLSuccess;
+}
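+// Both rank-1 update entry points compute A <- A + alpha * x * y^T. Equivalent
+// C++ illustration (hypothetical sizes m, n and scalar alpha):
+//
+//   viennacl::vector<float> x(m), y(n);
+//   viennacl::matrix<float> A(m, n);
+//   A += alpha * viennacl::linalg::outer_prod(x, y);   // rank-1 (outer-product) update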
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
new file mode 100644
index 0000000..bb6e03e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
@@ -0,0 +1,272 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// xGEMM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+  viennacl::backend::mem_handle C_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(C_handle, C) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::matrix_base<float>::size_type           size_type;
+      typedef viennacl::matrix_base<float>::size_type           difference_type;
+
+      viennacl::matrix_base<float> mat_A(A_handle,
+                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_B(B_handle,
+                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_C(C_handle,
+                                         size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+                                         size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::matrix_base<double>::size_type           size_type;
+      typedef viennacl::matrix_base<double>::size_type           difference_type;
+
+      viennacl::matrix_base<double> mat_A(A_handle,
+                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_B(B_handle,
+                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_C(C_handle,
+                                          size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+                                          size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::matrix_base<float>::size_type           size_type;
+      typedef viennacl::matrix_base<float>::size_type           difference_type;
+
+      viennacl::matrix_base<float> mat_A(A_handle,
+                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_B(B_handle,
+                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::matrix_base<double>::size_type           size_type;
+      typedef viennacl::matrix_base<double>::size_type           difference_type;
+
+      viennacl::matrix_base<double> mat_A(A_handle,
+                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_B(B_handle,
+                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
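+// Each branch above forwards to viennacl::linalg::inplace_solve, which overwrites the
+// right-hand side with the solution of the triangular system selected by the tag.
+// Illustrative C++ equivalent for the simplest case (no transposes, non-unit lower
+// triangle; hypothetical sizes n, k):
+//
+//   viennacl::matrix<double> A(n, n), B(n, k);
+//   viennacl::linalg::inplace_solve(A, B, viennacl::linalg::lower_tag());   // B <- A^{-1} * B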
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
new file mode 100644
index 0000000..bb6e03e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
@@ -0,0 +1,272 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// xGEMM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+  viennacl::backend::mem_handle C_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(C_handle, C) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::matrix_base<float>::size_type           size_type;
+      typedef viennacl::matrix_base<float>::size_type           difference_type;
+
+      viennacl::matrix_base<float> mat_A(A_handle,
+                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_B(B_handle,
+                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_C(C_handle,
+                                         size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+                                         size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::matrix_base<double>::size_type           size_type;
+      typedef viennacl::matrix_base<double>::size_type           difference_type;
+
+      viennacl::matrix_base<double> mat_A(A_handle,
+                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_B(B_handle,
+                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_C(C_handle,
+                                          size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+                                          size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::matrix_base<float>::size_type           size_type;
+      typedef viennacl::matrix_base<float>::size_type           difference_type;
+
+      viennacl::matrix_base<float> mat_A(A_handle,
+                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<float> mat_B(B_handle,
+                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      typedef viennacl::matrix_base<double>::size_type           size_type;
+      typedef viennacl::matrix_base<double>::size_type           difference_type;
+
+      viennacl::matrix_base<double> mat_A(A_handle,
+                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+      viennacl::matrix_base<double> mat_B(B_handle,
+                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+      {
+        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+        else
+          return ViennaCLGenericFailure;
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
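
Every branch of the xTRSM dispatch above reduces to a single call of viennacl::linalg::inplace_solve() from viennacl/linalg/direct_solve.hpp: the tag selects the triangular structure and viennacl::trans() handles the ViennaCLTrans flags. A minimal host-side sketch of that underlying call, assuming a square upper-triangular system with a non-zero diagonal (matrix contents are illustrative only):

    #include <cstddef>
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    void upper_solve_sketch()
    {
      viennacl::matrix<double> A(3, 3);   // system matrix, only its upper triangle is read
      viennacl::matrix<double> B(3, 2);   // right-hand sides, overwritten with the solution

      // keep A non-singular for the sketch: unit diagonal
      // (element-wise writes are slow, but sufficient for illustration)
      for (std::size_t i = 0; i < A.size1(); ++i)
        A(i, i) = 1.0;

      // B <- A^{-1} * B using the upper triangle of A
      viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());

      // unit_upper_tag(), lower_tag() and unit_lower_tag() select the other variants
      // dispatched above; wrapping A and/or B in viennacl::trans() corresponds to the
      // ViennaCLTrans cases.
    }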

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
new file mode 100644
index 0000000..cfcc034
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
@@ -0,0 +1,60 @@
+#ifndef VIENNACL_SRC_BLAS3_HPP
+#define VIENNACL_SRC_BLAS3_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+namespace detail
+{
+  template <typename ScalarType, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+  void gemm_dispatch(ScalarType alpha,
+                     MatrixTypeA const & A, ViennaCLTranspose transA,
+                     MatrixTypeB const & B, ViennaCLTranspose transB,
+                     ScalarType beta,
+                     MatrixTypeC & C)
+  {
+
+    if (transA == ViennaCLTrans && transB == ViennaCLTrans)
+      viennacl::linalg::prod_impl(viennacl::trans(A), viennacl::trans(B), C, alpha, beta);
+    else if (transA == ViennaCLTrans && transB == ViennaCLNoTrans)
+      viennacl::linalg::prod_impl(viennacl::trans(A), B, C, alpha, beta);
+    else if (transA == ViennaCLNoTrans && transB == ViennaCLTrans)
+      viennacl::linalg::prod_impl(A, viennacl::trans(B), C, alpha, beta);
+    else if (transA == ViennaCLNoTrans && transB == ViennaCLNoTrans)
+      viennacl::linalg::prod_impl(A, B, C, alpha, beta);
+    //else
+    //  return ViennaCLGenericFailure;
+  }
+}
+
+
+#endif
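
gemm_dispatch() assumes the caller has already constructed its matrix arguments with the dimensions of op(A) and op(B) (the backend wrappers below pass A_size1 = transA ? k : m, and so on), so the viennacl::trans() wrappers only tell prod_impl() how to traverse the underlying storage. A small sketch of a direct call under those assumptions:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"
    #include "blas3.hpp"   // the header above: detail::gemm_dispatch and the ViennaCL* enums

    void gemm_dispatch_sketch()
    {
      // shapes already match op(A) = 4x8, op(B) = 8x5, C = 4x5
      viennacl::matrix<float> A(4, 8), B(8, 5), C(4, 5);

      // ... fill A and B ...

      // C <- 1.0 * A * B + 0.0 * C
      detail::gemm_dispatch(1.0f, A, ViennaCLNoTrans, B, ViennaCLNoTrans, 0.0f, C);
    }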

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
new file mode 100644
index 0000000..318593b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
@@ -0,0 +1,133 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLCUDAgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    bool A_row_major = (orderA == ViennaCLRowMajor);
+    bool B_row_major = (orderB == ViennaCLRowMajor);
+    bool C_row_major = (orderC == ViennaCLRowMajor);
+
+    viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+                                         A_size1, offA_row, incA_row, A_row_major ? m : lda,
+                                         A_size2, offA_col, incA_col, A_row_major ? lda : k, A_row_major);
+
+    viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+                                         B_size1, offB_row, incB_row, B_row_major ? k : ldb,
+                                         B_size2, offB_col, incB_col, B_row_major ? ldb : n, B_row_major);
+
+    viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+                                         m, offC_row, incC_row, C_row_major ? m : ldc,
+                                         n, offC_col, incC_col, C_row_major ? ldc : n, C_row_major);
+
+    detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLCUDAgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLCUDAgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
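
A hedged usage sketch for the exported CUDA entry point: the implementation above ignores its backend argument, so the sketch passes a null handle; row-major storage with zero offsets and unit strides is assumed throughout, and the buffers are ordinary CUDA runtime allocations.

    #include <cuda_runtime.h>
    #include <vector>
    #include "viennacl.hpp"   // libviennacl public C API (declares ViennaCLCUDASgemm)

    void cuda_sgemm_sketch()
    {
      const ViennaCLInt m = 2, n = 3, k = 4;
      std::vector<float> hA(m * k, 1.0f), hB(k * n, 1.0f), hC(m * n, 0.0f);

      float *dA, *dB, *dC;
      cudaMalloc((void**)&dA, hA.size() * sizeof(float));
      cudaMalloc((void**)&dB, hB.size() * sizeof(float));
      cudaMalloc((void**)&dC, hC.size() * sizeof(float));
      cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dC, hC.data(), hC.size() * sizeof(float), cudaMemcpyHostToDevice);

      // C <- 1.0 * A * B + 0.0 * C; row-major, so lda = k, ldb = n, ldc = n
      ViennaCLCUDASgemm(NULL,
                        ViennaCLRowMajor, ViennaCLNoTrans,
                        ViennaCLRowMajor, ViennaCLNoTrans,
                        ViennaCLRowMajor,
                        m, n, k,
                        1.0f,
                        dA, 0, 0, 1, 1, k,
                        dB, 0, 0, 1, 1, n,
                        0.0f,
                        dC, 0, 0, 1, 1, n);

      cudaMemcpy(hC.data(), dC, hC.size() * sizeof(float), cudaMemcpyDeviceToHost);
      // every entry of hC is now 4.0f
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
    }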

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
new file mode 100644
index 0000000..16ef310
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
@@ -0,0 +1,131 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    typedef typename viennacl::matrix_base<NumericT>::size_type           size_type;
+    typedef typename viennacl::matrix_base<NumericT>::size_type           difference_type;
+
+    size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+    size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+    size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+    size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+    bool A_row_major = (orderA == ViennaCLRowMajor);
+    bool B_row_major = (orderB == ViennaCLRowMajor);
+    bool C_row_major = (orderC == ViennaCLRowMajor);
+
+    viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                         A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+                                         A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+    viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                         B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+                                         B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+    viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                         size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+                                         size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+    detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
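
The host entry points mirror the CUDA ones but take plain main-memory pointers; the backend argument is again unused by the implementation above, so a null handle is assumed to suffice in this sketch. Column-major storage is used here to show the other leading-dimension convention:

    #include <vector>
    #include "viennacl.hpp"   // libviennacl public C API (declares ViennaCLHostSgemm)

    void host_sgemm_sketch()
    {
      const ViennaCLInt m = 2, n = 3, k = 4;
      std::vector<float> A(m * k, 1.0f), B(k * n, 1.0f), C(m * n, 0.0f);

      // C <- 1.0 * A * B + 0.0 * C; column-major, so lda = m, ldb = k, ldc = m
      ViennaCLHostSgemm(NULL,
                        ViennaCLColumnMajor, ViennaCLNoTrans,
                        ViennaCLColumnMajor, ViennaCLNoTrans,
                        ViennaCLColumnMajor,
                        m, n, k,
                        1.0f,
                        A.data(), 0, 0, 1, 1, m,
                        B.data(), 0, 0, 1, 1, k,
                        0.0f,
                        C.data(), 0, 0, 1, 1, m);

      // every entry of C now equals 4.0f
    }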

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
new file mode 100644
index 0000000..16ef310
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
@@ -0,0 +1,131 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    typedef typename viennacl::matrix_base<NumericT>::size_type           size_type;
+    typedef typename viennacl::matrix_base<NumericT>::size_type           difference_type;
+
+    size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+    size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+    size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+    size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+    bool A_row_major = (orderA == ViennaCLRowMajor);
+    bool B_row_major = (orderB == ViennaCLRowMajor);
+    bool C_row_major = (orderC == ViennaCLRowMajor);
+
+    viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                         A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+                                         A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+    viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                         B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+                                         B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+    viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                         size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+                                         size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+    detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
new file mode 100644
index 0000000..d5e5c1e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
@@ -0,0 +1,136 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+                                         ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                         ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                         ViennaCLOrder orderC,
+                                         ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                         NumericT alpha,
+                                         cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                         cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                         NumericT beta,
+                                         cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    typedef typename viennacl::matrix_base<NumericT>::size_type           size_type;
+    typedef typename viennacl::matrix_base<NumericT>::size_type           difference_type;
+
+    size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+    size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+    size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+    size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+    bool A_row_major = (orderA == ViennaCLRowMajor);
+    bool B_row_major = (orderB == ViennaCLRowMajor);
+    bool C_row_major = (orderC == ViennaCLRowMajor);
+
+    viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+                                         A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+    viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+                                         B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+    viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+                                         size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+    detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+                                                orderA, transA,
+                                                orderB, transB,
+                                                orderC,
+                                                m, n, k,
+                                                alpha,
+                                                A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                beta,
+                                                C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+                                                 orderA, transA,
+                                                 orderB, transB,
+                                                 orderC,
+                                                 m, n, k,
+                                                 alpha,
+                                                 A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                 B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                 beta,
+                                                 C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
new file mode 100644
index 0000000..d5e5c1e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
@@ -0,0 +1,136 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+                                         ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                         ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                         ViennaCLOrder orderC,
+                                         ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                         NumericT alpha,
+                                         cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                         cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                         NumericT beta,
+                                         cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    typedef typename viennacl::matrix_base<NumericT>::size_type           size_type;
+    typedef typename viennacl::matrix_base<NumericT>::size_type           difference_type;
+
+    size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+    size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+    size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+    size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+    bool A_row_major = (orderA == ViennaCLRowMajor);
+    bool B_row_major = (orderB == ViennaCLRowMajor);
+    bool C_row_major = (orderC == ViennaCLRowMajor);
+
+    viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+                                         A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+    viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+                                         B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+    viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                         size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+                                         size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+    detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+                                                orderA, transA,
+                                                orderB, transB,
+                                                orderC,
+                                                m, n, k,
+                                                alpha,
+                                                A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                beta,
+                                                C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+                                                 orderA, transA,
+                                                 orderB, transB,
+                                                 orderC,
+                                                 m, n, k,
+                                                 alpha,
+                                                 A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                 B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                 beta,
+                                                 C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
new file mode 100644
index 0000000..aec9043
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_SOUTHERN_ISLANDS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_SOUTHERN_ISLANDS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace southern_islands{
+namespace tahiti{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,32,16,8,1,1,16,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,4,64));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(2,8,2,16,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,16,4,4,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,4,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,8,32,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,8,32,32,4,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,64,32,4,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,128,32,2,2,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,32,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
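
These add_4B/add_8B overloads register the tuned Tahiti GEMM profiles, one per transpose combination and scalar size, while the built-in device database is being populated. A hedged sketch of such a registration step; the helper name init_matrix_product_N_T and the surrounding structure are assumptions, only the calls into the header above are taken from the code:

    #include "viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp"

    namespace vds = viennacl::device_specific;
    namespace vdb = vds::builtin_database;

    // Hypothetical initialisation helper: one database per (transA, transB) combination.
    inline vdb::database_type<vds::matrix_product_template::parameters_type>
    init_matrix_product_N_T()
    {
      vdb::database_type<vds::matrix_product_template::parameters_type> db;

      // register the Tahiti profiles for float (4 B) and double (8 B), N x T case
      vdb::devices::gpu::amd::southern_islands::tahiti::add_4B(db, vds::char_to_type<'N'>(), vds::char_to_type<'T'>());
      vdb::devices::gpu::amd::southern_islands::tahiti::add_8B(db, vds::char_to_type<'N'>(), vds::char_to_type<'T'>());

      return db;
    }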

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
new file mode 100644
index 0000000..c2674f0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_VOLCANIC_ISLANDS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_VOLCANIC_ISLANDS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace volcanic_islands{
+namespace hawaii{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(2,8,8,8,6,1,6,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,16,16,16,2,1,8,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,1,2,64,8,2,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,16,16,16,2,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(2,16,16,16,6,1,6,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,64,64,4,2,4,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,64,4));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
new file mode 100644
index 0000000..ff307f3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_FALLBACK_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace fallback{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
new file mode 100644
index 0000000..24c02b7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
@@ -0,0 +1,59 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GT540M_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GT540M_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gt_540m{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 16, 16, 8, 4, 1, 8, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 16, 8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 16, 16, 16, 8, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 32, 8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 8, 16, 16, 8, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 16, 8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
new file mode 100644
index 0000000..31a329b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
@@ -0,0 +1,83 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GEFORCE_GTX_470_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GEFORCE_GTX_470_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gtx_470{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,2,32,32,4,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,2));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,8,16,8,2,2,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,4));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,128,32,1,2,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,16,32,4,4,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,2,16,64,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_CONTIGUOUS,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,32,32,16,2,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,8,16,32,8,2,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
new file mode 100644
index 0000000..7015ea5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GTX580_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GTX580_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gtx_580{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,2,1,128,4,1,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,16,128,32,2,4,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,4,64,128,4,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,128,32,1,1,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,32,4));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,2,32,32,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,2));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,16,32,16,4,4,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(2,16,16,16,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,128,16,2,4,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
new file mode 100644
index 0000000..f430d6c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_TESLA_C2050_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_TESLA_C2050_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace tesla_c2050{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,4,32,32,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,16,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,4,128,64,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_CONTIGUOUS,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
new file mode 100644
index 0000000..73a62fc
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_KEPLER_K20M_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_KEPLER_K20M_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace kepler{
+namespace tesla_k20m{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,2,8,32,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,4,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,16,16,32,2,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,32));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,2,8,64,16,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,4));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,128,32,1,1,1,16,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_LOCAL,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,8,32,16,4,8,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,8,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,32,16,32,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,64));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(4,8,2,4,8,2,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,128,64,1,4,2,16,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,16,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
new file mode 100644
index 0000000..2c3f080
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_3_GEFORCE_GTX_750_TI_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_3_GEFORCE_GTX_750_TI_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace maxwell{
+namespace geforce_gtx_750_ti{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,2,8,128,1,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,8,32,32,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,8,32,1,8,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,8,32,1,2,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,4,32,16,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,16,4));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,16,4,2,8,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,32,4,8,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,16,8,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
new file mode 100644
index 0000000..88dd596
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_TESLA_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_TESLA_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace tesla{
+namespace geforce_gtx_260{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
new file mode 100644
index 0000000..4437956
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
@@ -0,0 +1,244 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_MATRIX_PRODUCT_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_MATRIX_PRODUCT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp"
+
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp"
+
+
+#include "viennacl/ocl/device_utils.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp"
+
+/** @file viennacl/device_specific/builtin_database/matrix_product.hpp
+*
+* Initializes the device database with the provided profiles. Updated semi-automatically.
+*/
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace builtin_database
+{
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_N_N()
+{
+  database_type<matrix_product_template::parameters_type> result;
+
+  devices::accelerator::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::accelerator::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+  devices::cpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::cpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+  devices::gpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+  devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+  return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_T_N()
+{
+  database_type<matrix_product_template::parameters_type> result;
+
+  devices::accelerator::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::accelerator::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+  devices::cpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::cpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+  devices::gpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gt_540m::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+  devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+  return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_N_T()
+{
+  database_type<matrix_product_template::parameters_type> result;
+
+  devices::accelerator::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::accelerator::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+  devices::cpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::cpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+  devices::gpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+  devices::gpu::nvidia::fermi::geforce_gt_540m::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+
+  devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+  devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+  return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_T_T()
+{
+  database_type<matrix_product_template::parameters_type> result;
+
+  devices::accelerator::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::accelerator::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+  devices::cpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::cpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+  devices::gpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+  devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+  return result;
+}
+
+static database_type<matrix_product_template::parameters_type> matrix_product_N_N = init_matrix_product_N_N();
+static database_type<matrix_product_template::parameters_type> matrix_product_T_N = init_matrix_product_T_N();
+static database_type<matrix_product_template::parameters_type> matrix_product_N_T = init_matrix_product_N_T();
+static database_type<matrix_product_template::parameters_type> matrix_product_T_T = init_matrix_product_T_T();
+
+template<class NumericT>
+matrix_product_template::parameters_type const & matrix_product_params(ocl::device const & device, char A_trans, char B_trans)
+{
+  assert(A_trans=='N' || A_trans=='T');
+  assert(B_trans=='N' || B_trans=='T');
+  database_type<matrix_product_template::parameters_type> * db;
+  if (A_trans=='N' && B_trans=='N')
+    db = &matrix_product_N_N;
+  else if (A_trans=='T' && B_trans=='N')
+    db = &matrix_product_T_N;
+  else if (A_trans=='N' && B_trans=='T')
+    db = &matrix_product_N_T;
+  else
+    db = &matrix_product_T_T;
+  return get_parameters<NumericT>(*db, device);
+}
+
+
+}
+}
+}
+#endif
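The database assembled above is queried per device and per transposition pair via matrix_product_params<NumericT>(). The following is a minimal lookup sketch, assuming ViennaCL is built with OpenCL support and a context with an active device has already been set up; the helper name is illustrative:

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/device_specific/builtin_database/matrix_product.hpp"

    // Fetch the GEMM profile for C = A * B^T in single precision on the device
    // bound to the current ViennaCL OpenCL context; devices not listed above
    // are expected to resolve to the generic fallback profiles.
    inline viennacl::device_specific::matrix_product_template::parameters_type const &
    selected_gemm_profile()
    {
      viennacl::ocl::device const & dev = viennacl::ocl::current_device();
      return viennacl::device_specific::builtin_database::matrix_product_params<float>(dev, 'N', 'T');
    }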

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
new file mode 100644
index 0000000..2f4960a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
@@ -0,0 +1,55 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
+#define VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/execute.hpp
+    @brief The user interface for the code generator.
+*/
+
+#include <cstring>
+#include <vector>
+#include <typeinfo>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/execution_handler.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/timer.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false)
+{
+  //Generate program name
+  std::string program_name = tree_parsing::statements_representation(statements, BIND_TO_HANDLE);
+  execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation);
+  handler.add(program_name, T, statements);
+  handler.execute(program_name, statements);
+}
+
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
new file mode 100644
index 0000000..8f725fd
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
@@ -0,0 +1,102 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTION_HANDLER_HPP
+#define VIENNACL_DEVICE_SPECIFIC_EXECUTION_HANDLER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/execution_handler.hpp
+    @brief Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.
+*/
+
+#include <map>
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/device_specific/lazy_program_compiler.hpp"
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+class execution_handler
+{
+public:
+  typedef std::map< std::string, tools::shared_ptr<template_base> > container_type;
+
+private:
+  std::string append_prefix(std::string const & str)
+  {
+    return "_" + str;
+  }
+
+  std::string define_extension(std::string const & ext)
+  {
+    // Note: On devices without double precision support, 'ext' is an empty string.
+    return (ext.length() > 1) ? std::string("#pragma OPENCL EXTENSION " + ext + " : enable\n") : std::string("\n");
+  }
+
+  void init_program_compiler(std::string const & name, bool force_recompilation)
+  {
+    lazy_programs_.push_back(lazy_program_compiler(&ctx_, name, force_recompilation));
+    lazy_programs_.back().add(define_extension(device_.double_support_extension()));
+  }
+
+public:
+  execution_handler(std::string const & program_name_base, viennacl::ocl::context & ctx, viennacl::ocl::device const & device, bool force_recompilation = false) : ctx_(ctx), device_(device), program_names_(2)
+  {
+    lazy_programs_.reserve(2);
+    init_program_compiler(program_name_base + "_0", force_recompilation);
+    init_program_compiler(program_name_base + "_1", force_recompilation);
+  }
+
+  void add(std::string const & key, template_base const & T, statements_container const & statements)
+  {
+    if (kernels_.insert(container_type::value_type(key, T.clone())).second)
+    {
+      std::vector<std::string> sources = at(kernels_, key)->generate(append_prefix(key), statements, device_);
+      assert(sources.size()<=2);
+      for (unsigned int i = 0; i < sources.size(); ++i)
+        lazy_programs_[i].add(sources[i]);
+    }
+  }
+
+  template_base * template_of(std::string const & key)
+  {
+    return at(kernels_, key).get();
+  }
+
+  void execute(container_type::key_type const & key, statements_container const & statements)
+  {
+    tools::shared_ptr<template_base> & template_pointer = at(kernels_, key);
+    template_pointer->enqueue(append_prefix(key), lazy_programs_, statements);
+  }
+
+private:
+  viennacl::ocl::context & ctx_;
+  viennacl::ocl::device const & device_;
+  container_type kernels_;
+  std::vector<std::string> program_names_;
+  std::vector<lazy_program_compiler> lazy_programs_;
+};
+
+}
+}
+#endif
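
For orientation, a minimal usage sketch of the execution_handler added above. This is illustrative only and not part of the patch: it assumes an active OpenCL context, and my_template / stmts stand in for a concrete template_base implementation and a statements_container built elsewhere.

  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::device_specific::execution_handler handler("my_program", ctx, ctx.current_device());
  handler.add("my_op", my_template, stmts);   // clones the template and generates at most two program sources
  handler.execute("my_op", stmts);            // enqueues the kernels registered under "my_op"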

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h b/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
new file mode 100644
index 0000000..590ed1f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
@@ -0,0 +1,294 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_FORWARDS_H
+#define VIENNACL_DEVICE_SPECIFIC_FORWARDS_H
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/forwards.h
+    @brief Forward declarations
+*/
+
+#include <list>
+#include <map>
+#include <set>
+#include <stdexcept>
+
+#include "viennacl/scheduler/io.hpp"
+
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+//Error codes
+static const int TEMPLATE_VALID = 0;
+static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
+static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
+static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
+static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
+static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
+static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
+static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
+static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE = -9;
+
+static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
+static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
+static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
+static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
+static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
+static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
+static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
+static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
+static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
+
+struct index_tuple
+{
+  index_tuple(std::string const & _i, std::string const & _bound0) : i(_i), bound0(_bound0), j(""), bound1(""){ }
+  index_tuple(std::string const & _i, std::string const & _bound0, std::string const & _j, std::string const & _bound1) : i(_i), bound0(_bound0), j(_j), bound1(_bound1){ }
+  std::string i;
+  std::string bound0;
+  std::string j;
+  std::string bound1;
+};
+
+inline bool is_scalar_reduction(scheduler::statement_node const & node)
+{
+  return node.op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE || node.op.type_family==scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY;
+}
+
+inline bool is_vector_reduction(scheduler::statement_node const & node)
+{
+  return node.op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
+      || node.op.type_family==scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+      || node.op.type_family==scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
+}
+
+inline scheduler::statement_node const & lhs_most(scheduler::statement::container_type const & array, vcl_size_t root)
+{
+  scheduler::statement_node const * current = &array[root];
+  while (current->lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+    current = &array[current->lhs.node_index];
+  return *current;
+}
+
+enum expression_type
+{
+  SCALAR_AXPY_TYPE,
+  VECTOR_AXPY_TYPE,
+  MATRIX_AXPY_TYPE,
+  REDUCTION_TYPE,
+  ROW_WISE_REDUCTION_Nx_TYPE,
+  ROW_WISE_REDUCTION_Tx_TYPE,
+  MATRIX_PRODUCT_NN_TYPE,
+  MATRIX_PRODUCT_TN_TYPE,
+  MATRIX_PRODUCT_NT_TYPE,
+  MATRIX_PRODUCT_TT_TYPE,
+  INVALID_EXPRESSION_TYPE
+};
+
+inline const char * expression_type_to_string(expression_type type)
+{
+  switch (type)
+  {
+  case SCALAR_AXPY_TYPE : return "Scalar AXPY";
+  case VECTOR_AXPY_TYPE : return "Vector AXPY";
+  case MATRIX_AXPY_TYPE : return "Matrix AXPY";
+  case REDUCTION_TYPE : return "Reduction";
+  case ROW_WISE_REDUCTION_Nx_TYPE : return "Row-wise reduction: Ax";
+  case ROW_WISE_REDUCTION_Tx_TYPE : return "Row-wise reduction: Tx";
+  case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product: AA";
+  case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product: TA";
+  case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product: AT";
+  case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product: TT";
+  default : return "INVALID EXPRESSION";
+  }
+}
+
+/** @brief generate the string for a value kernel argument */
+static std::string generate_value_kernel_argument(std::string const & scalartype, std::string const & name)
+{
+  return scalartype + ' ' + name + ",";
+}
+
+/** @brief generate the string for a pointer kernel argument */
+static std::string generate_pointer_kernel_argument(std::string const & address_space, std::string const & scalartype, std::string const & name)
+{
+  return address_space +  " " + scalartype + "* " + name + ",";
+}
+
+/** @brief Emulation of C++11's .at() member for std::map<>, const-version */
+template<typename KeyT, typename ValueT>
+ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
+{
+  typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
+  if (it != map.end())
+    return it->second;
+
+  throw std::out_of_range("Generator: Key not found in map");
+}
+
+/** @brief Emulation of C++11's .at() member for std::map<>, non-const version */
+template<typename KeyT, typename ValueT>
+ValueT & at(std::map<KeyT, ValueT> & map, KeyT const & key)
+{
+  typename std::map<KeyT, ValueT>::iterator it = map.find(key);
+  if (it != map.end())
+    return it->second;
+
+  throw std::out_of_range("Generator: Key not found in map");
+}
+
+/** @brief Exception for the case the generator is unable to deal with the operation */
+class generator_not_supported_exception : public std::exception
+{
+public:
+  generator_not_supported_exception() : message_() {}
+  generator_not_supported_exception(std::string message) : message_("ViennaCL: Internal error: The generator cannot handle the statement provided: " + message) {}
+  virtual const char* what() const throw() { return message_.c_str(); }
+  virtual ~generator_not_supported_exception() throw() {}
+private:
+  std::string message_;
+};
+
+namespace utils
+{
+  class kernel_generation_stream;
+}
+
+
+enum leaf_t
+{
+  LHS_NODE_TYPE,
+  PARENT_NODE_TYPE,
+  RHS_NODE_TYPE
+};
+
+class mapped_object;
+class template_base;
+
+typedef std::pair<vcl_size_t, leaf_t> mapping_key;
+typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
+
+
+namespace tree_parsing
+{
+
+  template<class Fun>
+  inline void traverse(scheduler::statement const & statement, vcl_size_t root_idx, Fun const & fun, bool inspect);
+
+  inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+                      scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed);
+  inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors, scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping);
+}
+
+using scheduler::INT_TYPE;
+using scheduler::UINT_TYPE;
+using scheduler::ULONG_TYPE;
+using scheduler::LONG_TYPE;
+using scheduler::FLOAT_TYPE;
+using scheduler::DOUBLE_TYPE;
+
+typedef cl_uint vendor_id_type;
+typedef cl_device_type device_type;
+typedef std::string device_name_type;
+
+class symbolic_binder
+{
+public:
+  virtual ~symbolic_binder(){ }
+  virtual bool bind(viennacl::backend::mem_handle const * ph) = 0;
+  virtual unsigned int get(viennacl::backend::mem_handle const * ph) = 0;
+};
+
+class bind_to_handle : public symbolic_binder
+{
+public:
+  bind_to_handle() : current_arg_(0){ }
+  bool bind(viennacl::backend::mem_handle const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; }
+  unsigned int get(viennacl::backend::mem_handle const * ph){ return bind(ph) ? current_arg_++ : at(memory, (void*)ph); }
+private:
+  unsigned int current_arg_;
+  std::map<void*,unsigned int> memory;
+};
+
+class bind_all_unique : public symbolic_binder
+{
+public:
+  bind_all_unique() : current_arg_(0){ }
+  bool bind(viennacl::backend::mem_handle const *) {return true; }
+  unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; }
+private:
+  unsigned int current_arg_;
+  std::map<void*,unsigned int> memory;
+};
+
+enum binding_policy_t{
+  BIND_ALL_UNIQUE,
+  BIND_TO_HANDLE
+};
+
+inline tools::shared_ptr<symbolic_binder> make_binder(binding_policy_t policy)
+{
+  if (policy==BIND_TO_HANDLE)
+    return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
+  else
+    return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
+}
+
+template<char C>
+struct char_to_type{ };
+
+class statements_container
+{
+public:
+  typedef std::list<scheduler::statement> data_type;
+  enum order_type { SEQUENTIAL, INDEPENDENT };
+
+  statements_container(data_type const & data, order_type order) : data_(data), order_(order)
+  { }
+
+  statements_container(scheduler::statement const & s0) : order_(INDEPENDENT)
+  {
+    data_.push_back(s0);
+  }
+
+  statements_container(scheduler::statement const & s0, scheduler::statement const & s1, order_type order) : order_(order)
+  {
+    data_.push_back(s0);
+    data_.push_back(s1);
+  }
+
+  std::list<scheduler::statement> const & data() const { return data_; }
+
+  order_type order() const { return order_; }
+
+private:
+  std::list<scheduler::statement> data_;
+  order_type order_;
+};
+
+}
+
+}
+#endif
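
To make the at() helpers above concrete, here is a minimal, self-contained sketch (assuming an OpenCL-enabled build, since forwards.h pulls in the OpenCL forward declarations):

  #include <iostream>
  #include <map>
  #include <string>
  #include "viennacl/device_specific/forwards.h"

  int main()
  {
    std::map<std::string, int> widths;
    widths["float"] = 4;
    // Same semantics as C++11 std::map::at(): returns the mapped value or throws std::out_of_range.
    std::cout << viennacl::device_specific::at(widths, std::string("float")) << std::endl;  // prints 4
    return 0;
  }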

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
new file mode 100644
index 0000000..3e75b9b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
@@ -0,0 +1,74 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_LAZY_PROGRAM_COMPILER_HPP
+#define VIENNACL_DEVICE_SPECIFIC_LAZY_PROGRAM_COMPILER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/lazy_program_compiler.hpp
+    @brief Helper for compiling a program lazily
+*/
+
+#include <map>
+
+#include "viennacl/ocl/context.hpp"
+
+namespace viennacl
+{
+
+namespace device_specific
+{
+
+  class lazy_program_compiler
+  {
+  public:
+
+    lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, std::string const & src, bool force_recompilation) : ctx_(ctx), name_(name), src_(src), force_recompilation_(force_recompilation){ }
+    lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, bool force_recompilation) : ctx_(ctx), name_(name), force_recompilation_(force_recompilation){ }
+
+    void add(std::string const & src) {  src_+=src; }
+
+    std::string const & src() const { return src_; }
+
+    viennacl::ocl::program & program()
+    {
+      if (force_recompilation_ && ctx_->has_program(name_))
+        ctx_->delete_program(name_);
+      if (!ctx_->has_program(name_))
+      {
+#ifdef VIENNACL_BUILD_INFO
+          std::cerr << "Creating program " << program_name << std::endl;
+#endif
+          ctx_->add_program(src_, name_);
+#ifdef VIENNACL_BUILD_INFO
+          std::cerr << "Done creating program " << program_name << std::endl;
+#endif
+      }
+      return ctx_->get_program(name_);
+    }
+
+  private:
+    viennacl::ocl::context * ctx_;
+    std::string name_;
+    std::string src_;
+    bool force_recompilation_;
+  };
+
+}
+
+}
+#endif
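
A short sketch of how the lazy_program_compiler above is intended to be driven (illustrative only; the kernel source and program name are placeholders, and an active OpenCL context is assumed):

  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::device_specific::lazy_program_compiler compiler(&ctx, "my_lazy_program", false);
  compiler.add("__kernel void noop() {}\n");           // only appends source, nothing is compiled yet
  viennacl::ocl::program & prog = compiler.program();  // compiles and registers the program on first access
  (void)prog;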


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
new file mode 100644
index 0000000..bd5116d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
@@ -0,0 +1,1019 @@
+#ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/opencl/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using OpenCL.
+*/
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, row_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & element_kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, row_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & legacy_kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
+const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
+const std::string SVD_GIVENS_NEXT_KERNEL = "givens_next";
+const std::string SVD_COPY_COL_KERNEL = "copy_col";
+const std::string SVD_COPY_ROW_KERNEL = "copy_row";
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const & src)
+{
+  assert(dest.row_major() == src.row_major() && bool("Conversion between mixed matrix layouts not supported yet!"));
+
+  assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  std::string kernel_name("convert_");
+  kernel_name += dest.row_major() ? "row_" : "col_";
+  kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
+  kernel_name += "_";
+  kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(dest).context());
+  viennacl::linalg::opencl::kernels::matrix_convert::init(ctx);
+  viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_convert::program_name(), kernel_name);
+
+  viennacl::ocl::enqueue(k( dest, cl_uint(dest.start1()), cl_uint(dest.stride1()), cl_uint(dest.size1()), cl_uint(dest.internal_size1()), cl_uint(dest.start2()), cl_uint(dest.stride2()), cl_uint(dest.size2()), cl_uint(dest.internal_size2()),
+                            src,  cl_uint( src.start1()), cl_uint( src.stride1()), cl_uint( src.size1()), cl_uint( src.internal_size1()), cl_uint( src.start2()), cl_uint( src.stride2()), cl_uint( src.size2()), cl_uint( src.internal_size2())
+                        ) );
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+template <typename NumericT,
+          typename ScalarT1>
+void am(matrix_base<NumericT> & mat1,
+        matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  viennacl::ocl::kernel & k= detail::kernel_for_matrix(mat1, (viennacl::is_cpu_scalar<ScalarT1>::value ? "am_cpu" : "am_gpu"));
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                          cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                          cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                          cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                          cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                          options_alpha,
+                          viennacl::traits::opencl_handle(mat2),
+                          cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                          cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                          cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
+                          )
+                        );
+}
+
+
+template <typename NumericT,
+          typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  std::string kernel_name;
+  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_cpu_cpu";
+  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && !viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_gpu_cpu";
+  else
+    kernel_name = "ambm_gpu_gpu";
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                          cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                          cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                          cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                          cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                          options_alpha,
+                          viennacl::traits::opencl_handle(mat2),
+                          cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                          cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                          cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+                          options_beta,
+                          viennacl::traits::opencl_handle(mat3),
+                          cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
+                          cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
+                          cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
+                          )
+                        );
+}
+
+
+template <typename NumericT,
+          typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  std::string kernel_name;
+  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_cpu_cpu";
+  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && !viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_gpu_cpu";
+  else
+    kernel_name = "ambm_m_gpu_gpu";
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                          cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                          cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                          cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                          cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                          options_alpha,
+                          viennacl::traits::opencl_handle(mat2),
+                          cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                          cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                          cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+                          options_beta,
+                          viennacl::traits::opencl_handle(mat3),
+                          cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
+                          cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
+                          cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
+                          )
+                        );
+}
+
+template<typename NumericT,
+          typename SizeT, typename DistanceT>
+void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
+           matrix_base<NumericT> & temp_trans)
+{
+  std::string kernel_name("trans_kernel");
+  viennacl::ocl::kernel& kernel = detail::legacy_kernel_for_matrix(proxy.lhs(),kernel_name);
+  viennacl::ocl::enqueue(kernel(proxy.lhs(),
+                                static_cast<cl_uint>(proxy.lhs().start1()),         static_cast<cl_uint>(proxy.lhs().start2()),
+                                static_cast<cl_uint>(proxy.lhs().internal_size1()), static_cast<cl_uint>(proxy.lhs().internal_size2()),
+                                static_cast<cl_uint>(proxy.lhs().size1()),          static_cast<cl_uint>(proxy.lhs().size2()),
+                                static_cast<cl_uint>(proxy.lhs().stride1()),        static_cast<cl_uint>(proxy.lhs().stride2()),
+
+                                temp_trans,
+                                static_cast<cl_uint>(temp_trans.start1()),         static_cast<cl_uint>(temp_trans.start2()),
+                                static_cast<cl_uint>(temp_trans.internal_size1()), static_cast<cl_uint>(temp_trans.internal_size2()),
+                                static_cast<cl_uint>(temp_trans.stride1()),        static_cast<cl_uint>(temp_trans.stride2())));
+}
+
+template <typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : cl_uint(viennacl::traits::size1(mat));
+  cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : cl_uint(viennacl::traits::size2(mat));
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "assign_cpu");
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                           cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
+                           cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
+                           s1,                                               s2,
+                           cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
+                          )
+                        );
+}
+
+template <typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "diagonal_assign_cpu");
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                           cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
+                           cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
+                           cl_uint(viennacl::traits::size1(mat)),            cl_uint(viennacl::traits::size2(mat)),
+                           cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
+                          )
+                        );
+}
+
+template <typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+  // Step 1: set everything to zero
+  matrix_assign(mat, NumericT(0));
+
+  // Step 2: set the diagonal:
+
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                              + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                              + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
+                              size_mat,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(vec),
+                              size_vec)
+                        );
+}
+
+template <typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
+{
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                              + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                              + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                              size_vec,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(mat),
+                              size_mat)
+                        );
+}
+
+template <typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                              size_vec,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(mat),
+                              size_mat)
+                        );
+}
+
+template <typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    size_mat.start  = cl_uint(viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    size_mat.start  = cl_uint(viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                              size_vec,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(mat),
+                              size_mat)
+                        );
+}
+
+
+//
+///////////////////////// Element-wise operation //////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+/** @brief Implementation of binary element-wise operations A = OP(B,C)
+*
+* @param A      The result matrix (or -range, or -slice)
+* @param proxy  The proxy object holding B, C, and the operation
+*/
+template <typename T, typename OP>
+void element_op(matrix_base<T> & A,
+                matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<OP> > const & proxy)
+{
+  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(A, "element_op");
+
+  cl_uint op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OP>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OP>::value)
+    op_type = 0;
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                          cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                          cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                          cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                          cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                          viennacl::traits::opencl_handle(proxy.lhs()),
+                          cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
+                          cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                          cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
+
+                          viennacl::traits::opencl_handle(proxy.rhs()),
+                          cl_uint(viennacl::traits::start1(proxy.rhs())),           cl_uint(viennacl::traits::start2(proxy.rhs())),
+                          cl_uint(viennacl::traits::stride1(proxy.rhs())),          cl_uint(viennacl::traits::stride2(proxy.rhs())),
+                          cl_uint(viennacl::traits::internal_size1(proxy.rhs())),   cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
+
+                          op_type)
+                        );
+}
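+
+// Usage sketch (illustrative, not part of this patch): the binary element_op above backs
+// the user-facing element-wise wrappers for matrices of matching size and layout, e.g.
+//   A = viennacl::linalg::element_prod(B, C);   // reaches this overload with op_type == 0
+//   A = viennacl::linalg::element_div(B, C);    // reaches this overload with op_type == 1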
+
+
+// Unary operations
+
+/** @brief Implementation of unary element-wise operations A = OP(B)
+*
+* @param A      The result matrix (or -range, or -slice)
+* @param proxy  The proxy object holding B and the operation
+*/
+template <typename T, typename OP>
+void element_op(matrix_base<T> & A,
+                matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<OP> > const & proxy)
+{
+  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::kernel & k = detail::element_kernel_for_matrix(A, detail::op_to_string(OP()) + "_assign");
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                           cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                           cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                           cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                           cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                           viennacl::traits::opencl_handle(proxy.lhs()),
+                           cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
+                           cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                           cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
+                        );
+}
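+
+// Usage sketch (illustrative, not part of this patch): unary wrappers such as
+//   A = viennacl::linalg::element_exp(B);
+//   A = viennacl::linalg::element_sqrt(B);
+// end up in the overload above; the kernel name is derived via detail::op_to_string(OP()).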
+
+
+//
+/////////////////////////   matrix-vector products /////////////////////////////////
+//
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param result The result vector
+*/
+template <typename NumericT>
+void prod_impl(const matrix_base<NumericT> & mat, bool trans_A,
+               const vector_base<NumericT> & vec,
+                     vector_base<NumericT> & result)
+{
+  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, trans_A ? "trans_vec_mul" : "vec_mul");
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                          cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)),
+                          cl_uint(viennacl::traits::stride1(mat)),        cl_uint(viennacl::traits::stride2(mat)),
+                          cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
+                          cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+
+                          viennacl::traits::opencl_handle(vec),
+                          cl_uint(viennacl::traits::start(vec)),
+                          cl_uint(viennacl::traits::stride(vec)),
+                          cl_uint(viennacl::traits::size(vec)),
+
+                          viennacl::traits::opencl_handle(result),
+                          cl_uint(viennacl::traits::start(result)),
+                          cl_uint(viennacl::traits::stride(result)),
+                          cl_uint(viennacl::traits::size(result)),
+
+                          viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size())
+                        ) );
+}
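+
+// Usage sketch (illustrative, not part of this patch): this routine is reached through the
+// convenience expression mentioned above, for a matrix and vectors of matching dimensions:
+//   result = viennacl::linalg::prod(mat, vec);                   // trans_A == false
+//   result = viennacl::linalg::prod(viennacl::trans(mat), vec);  // trans_A == true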
+
+
+//
+
+
+/** @brief Carries out matrix-matrix multiplication
+*
+* Implementation of C = prod(A, B);
+*
+*/
+template<typename NumericT, typename ScalarType >
+void prod_impl(matrix_base<NumericT> const & A, bool A_trans,
+               matrix_base<NumericT> const & B, bool B_trans,
+               matrix_base<NumericT>       & C,
+               ScalarType alpha,
+               ScalarType beta)
+{
+    bool effective_A_trans = A_trans ^ A.row_major();
+    bool effective_B_trans = B_trans ^ B.row_major();
+
+    char cAt = effective_A_trans ? 'T' : 'N';
+    char cBt = effective_B_trans ? 'T' : 'N';
+
+    std::string kernel_prefix("prod_");
+    kernel_prefix+=cAt;
+    kernel_prefix+=cBt;
+
+    scheduler::statement statement = scheduler::preset::mat_mat_prod(alpha, &A, effective_A_trans, &B, effective_B_trans, beta, &C);
+    kernels::matrix_prod<NumericT>::execution_handler(C.row_major(), viennacl::traits::opencl_context(C)).execute(kernel_prefix, statement);
+}
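+
+// Usage sketch (illustrative, not part of this patch): C = viennacl::linalg::prod(A, B)
+// reaches this overload with alpha = 1 and beta = 0; the kernel prefix ("prod_NN",
+// "prod_TN", ...) follows from the effective transposition flags computed above.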
+
+//
+/////////////////////////   miscellaneous operations /////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+*
+* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+*
+* @param A    The matrix to be updated
+* @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+* @param len_alpha        Length of the buffer for an eventual final reduction step (currently always '1')
+* @param reciprocal_alpha Use 1/alpha instead of alpha
+* @param flip_sign_alpha  Use -alpha instead of alpha
+* @param vec1    The first vector
+* @param vec2    The second vector
+*/
+template<typename NumericT, typename ScalarT1>
+void scaled_rank_1_update(matrix_base<NumericT> & A,
+                          ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                          const vector_base<NumericT> & vec1,
+                          const vector_base<NumericT> & vec2)
+{
+  assert( (viennacl::traits::size1(A) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
+  assert( (viennacl::traits::size2(A) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  bool is_cpu = viennacl::is_cpu_scalar<ScalarT1>::value;
+  viennacl::ocl::kernel& kernel= detail::legacy_kernel_for_matrix(A, is_cpu ? "scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
+
+  viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
+                           cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                           cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                           cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                           cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                           options_alpha,
+
+                           viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           cl_uint(viennacl::traits::size(vec1)),
+
+                           viennacl::traits::opencl_handle(vec2),
+                           cl_uint(viennacl::traits::start(vec2)),
+                           cl_uint(viennacl::traits::stride(vec2)),
+                           cl_uint(viennacl::traits::size(vec2))
+                          )
+                        );
+}
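+
+// Usage sketch (illustrative, not part of this patch): the rank-1 update above is what
+//   A += alpha * viennacl::linalg::outer_prod(vec1, vec2);
+// lowers to, with alpha supplied either as a host scalar (*_cpu kernel) or as a
+// viennacl::scalar<NumericT> (*_gpu kernel).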
+
+//
+template <typename SCALARTYPE, typename VectorType>
+void bidiag_pack_svd(viennacl::matrix<SCALARTYPE>& A,
+                 VectorType & dh,
+                 VectorType & sh
+                )
+{
+  viennacl::vector<SCALARTYPE> D(dh.size());
+  viennacl::vector<SCALARTYPE> S(sh.size());
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+  viennacl::ocl::enqueue(kernel(
+                                A,
+                                D,
+                                S,
+                                static_cast<cl_uint>(A.size1()),
+                                static_cast<cl_uint>(A.size2()),
+                                static_cast<cl_uint>(A.internal_size2())
+                              ));
+
+  fast_copy(D, dh);
+  fast_copy(S, sh);
+}
+
+
+template <typename NumericT>
+void bidiag_pack(matrix_base<NumericT> & A,
+                 viennacl::vector<NumericT> & dh,
+                 viennacl::vector<NumericT> & sh
+                )
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+  if(A.row_major())
+  {
+      viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+      viennacl::ocl::enqueue(kernel(
+                                    A,
+                                    dh,
+                                    sh,
+                                    cl_uint(viennacl::traits::size1(A)),
+                                    cl_uint(viennacl::traits::size2(A)),
+                                    cl_uint(viennacl::traits::internal_size2(A))
+                                  ));
+  }
+  else
+  {
+      viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+      viennacl::ocl::enqueue(kernel(
+                                    A,
+                                    dh,
+                                    sh,
+                                    cl_uint(viennacl::traits::size1(A)),
+                                    cl_uint(viennacl::traits::size2(A)),
+                                    cl_uint(viennacl::traits::internal_size2(A))
+                                  ));
+  }
+}
+
+
+template <typename NumericT>
+void house_update_A_left(matrix_base<NumericT> & A,
+                         vector_base<NumericT> & D,
+                         vcl_size_t start)
+{
+
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+    if(A.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(start + 1),
+                                      static_cast<cl_uint>(start),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      cl_uint(viennacl::traits::internal_size2(A)),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                              ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(start + 1),
+                                      static_cast<cl_uint>(start),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      cl_uint(viennacl::traits::internal_size2(A)),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                              ));
+    }
+}
+
+template <typename NumericT>
+void house_update_A_right(matrix_base<NumericT> & A,
+                          vector_base<NumericT> & D)
+{
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+    if(A.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(0),
+                                      static_cast<cl_uint>(0),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      cl_uint(viennacl::traits::internal_size2(A)),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                              ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(0),
+                                      static_cast<cl_uint>(0),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      cl_uint(viennacl::traits::internal_size2(A)),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                              ));
+    }
+}
+
+
+
+template <typename NumericT>
+void house_update_QL(matrix_base<NumericT> & Q,
+                     vector_base<NumericT> & D,
+                     vcl_size_t A_size1)
+
+{
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(Q).context());
+
+    if(Q.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                        Q,
+                                        D,
+                                        cl_uint(A_size1),
+                                        cl_uint(viennacl::traits::internal_size2(Q)),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                                    ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                        Q,
+                                        D,
+                                        cl_uint(A_size1),
+                                        cl_uint(viennacl::traits::internal_size2(Q)),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+                                    ));
+    }
+
+}
+
+
+template<typename NumericT>
+  void givens_next(matrix_base<NumericT> & matrix,
+                  vector_base<NumericT>& tmp1,
+                  vector_base<NumericT>& tmp2,
+                  int l,
+                  int m
+                )
+  {
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+
+    if(matrix.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
+        kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
+        kernel.local_work_size(0, 256);
+
+        viennacl::ocl::enqueue(kernel(
+                                      matrix,
+                                      tmp1,
+                                      tmp2,
+                                      cl_uint(viennacl::traits::size1(matrix)),
+                                      cl_uint(viennacl::traits::internal_size2(matrix)),
+                                      static_cast<cl_uint>(l),
+                                      static_cast<cl_uint>(m - 1)
+                              ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
+        kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
+        kernel.local_work_size(0, 256);
+
+        viennacl::ocl::enqueue(kernel(
+                                      matrix,
+                                      tmp1,
+                                      tmp2,
+                                      cl_uint(viennacl::traits::size1(matrix)),
+                                      cl_uint(viennacl::traits::internal_size2(matrix)),
+                                      static_cast<cl_uint>(l),
+                                      static_cast<cl_uint>(m - 1)
+                              ));
+    }
+  }
+
+  template <typename NumericT>
+  void copy_vec(matrix_base<NumericT>& A,
+                vector_base<NumericT> & V,
+                vcl_size_t row_start,
+                vcl_size_t col_start,
+                bool copy_col
+  )
+  {
+    std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+    if(A.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), kernel_name);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      V,
+                                      static_cast<cl_uint>(row_start),
+                                      static_cast<cl_uint>(col_start),
+                                      copy_col ? cl_uint(viennacl::traits::size1(A))
+                                               : cl_uint(viennacl::traits::size2(A)),
+                                      static_cast<cl_uint>(A.internal_size2())
+                              ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), kernel_name);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      V,
+                                      static_cast<cl_uint>(row_start),
+                                      static_cast<cl_uint>(col_start),
+                                      copy_col ? cl_uint(viennacl::traits::size1(A))
+                                               : cl_uint(viennacl::traits::size2(A)),
+                                      static_cast<cl_uint>(A.internal_size2())
+                              ));
+    }
+  }
+
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
new file mode 100644
index 0000000..83a3db7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
@@ -0,0 +1,69 @@
+#ifndef VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/misc_operations.hpp
+    @brief Implementations of operations using compressed_matrix and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
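+/** @brief Substitution step for one level of a level-scheduling ILU preconditioner application.
+*
+* Enqueues the level_scheduling_substitute OpenCL kernel on the supplied index and value buffers in order to update the respective entries of x.
+*/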
+template<typename NumericT>
+void level_scheduling_substitute(vector<NumericT> & x,
+                                 viennacl::backend::mem_handle const & row_index_array,
+                                 viennacl::backend::mem_handle const & row_buffer,
+                                 viennacl::backend::mem_handle const & col_buffer,
+                                 viennacl::backend::mem_handle const & element_buffer,
+                                 vcl_size_t num_rows
+                                )
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+
+  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "level_scheduling_substitute");
+
+  viennacl::ocl::enqueue(k(row_index_array.opencl_handle(), row_buffer.opencl_handle(), col_buffer.opencl_handle(), element_buffer.opencl_handle(),
+                           x,
+                           static_cast<cl_uint>(num_rows)));
+}
+
+} //namespace detail
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
new file mode 100644
index 0000000..5daf297
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
@@ -0,0 +1,139 @@
+#ifndef VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/opencl/nmf_operations.hpp
+ @brief Implementations of NMF operations using OpenCL
+ */
+
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V     Input matrix
+ * @param W     First factor
+ * @param H     Second factor
+ * @param conf  A configuration object holding tolerances, iteration limits, and convergence-check settings
+ */
+template<typename NumericT>
+void nmf(viennacl::matrix_base<NumericT> const & V,
+         viennacl::matrix_base<NumericT>       & W,
+         viennacl::matrix_base<NumericT>       & H,
+         viennacl::linalg::nmf_config const & conf)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(V).context());
+
+  const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
+
+  viennacl::linalg::opencl::kernels::nmf<NumericT>::init(ctx);
+
+  vcl_size_t k = W.size2();
+  conf.iters_ = 0;
+
+  if (viennacl::linalg::norm_frobenius(W) <= 0)
+    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), ctx);
+
+  if (viennacl::linalg::norm_frobenius(H) <= 0)
+    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), ctx);
+
+  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx);
+  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx);
+  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), ctx);
+
+  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx);
+  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx);
+  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx);
+
+  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), ctx);
+
+  NumericT last_diff = 0;
+  NumericT diff_init = 0;
+  bool stagnation_flag = false;
+
+  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+  {
+    conf.iters_ = i + 1;
+    {
+      hn = viennacl::linalg::prod(trans(W), V);
+      htmp = viennacl::linalg::prod(trans(W), W);
+      hd = viennacl::linalg::prod(htmp, H);
+
+      viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
+      viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
+    }
+    {
+      wn = viennacl::linalg::prod(V, trans(H));
+      wtmp = viennacl::linalg::prod(W, H);
+      wd = viennacl::linalg::prod(wtmp, trans(H));
+
+      viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
+
+      viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
+    }
+
+    if (i % conf.check_after_steps() == 0)  //check for convergence
+    {
+      appr = viennacl::linalg::prod(W, H);
+
+      appr -= V;
+      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+      if (i == 0)
+        diff_init = diff_val;
+
+      if (conf.print_relative_error())
+        std::cout << diff_val / diff_init << std::endl;
+
+      // Approximation check
+      if (diff_val / diff_init < conf.tolerance())
+        break;
+
+      // Stagnation check
+      if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+      {
+        if (stagnation_flag)    // iteration stagnates (two iterates with no notable progress)
+          break;
+        else
+          // record stagnation in this iteration
+          stagnation_flag = true;
+      } else
+        // good progress in this iteration, so unset stagnation flag
+        stagnation_flag = false;
+
+      // prepare for next iterate:
+      last_diff = diff_val;
+    }
+  }
+}
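+
+// Usage sketch (illustrative only): this backend routine is normally reached via the public wrapper
+// viennacl::linalg::nmf() from viennacl/linalg/nmf.hpp, e.g. along the lines of
+//
+//   viennacl::matrix<float> V(m, n), W(m, k), H(k, n);
+//   // ... fill V with nonnegative data, initialize W and H with positive values ...
+//   viennacl::linalg::nmf_config conf;        // default tolerances and iteration limits
+//   viennacl::linalg::nmf(V, W, H, conf);     // afterwards, V is approximated by prod(W, H)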
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
new file mode 100644
index 0000000..a94681f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
@@ -0,0 +1,205 @@
+#ifndef VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/scalar_operations.hpp
+    @brief Implementations of scalar operations using OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/scalar.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
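+/** @brief Scaled assignment of a GPU scalar: computes s1 = alpha * s2.
+*
+* The option bits built via detail::make_options() encode whether alpha enters as 1/alpha and/or with flipped sign;
+* the *_cpu / *_gpu kernel variant is selected depending on whether alpha resides on the host or on the device.
+*/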
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_any_scalar<NumericT>::value
+                            >::type
+as(ScalarT1 & s1,
+   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  bool is_cpu = viennacl::is_cpu_scalar<NumericT>::value;
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), is_cpu ? "as_cpu" : "as_gpu");
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2) )
+                        );
+}
+
+
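+/** @brief Scaled addition of GPU scalars: computes s1 = alpha * s2 + beta * s3.
+*
+* As for as(), the option bits encode an optional reciprocal and/or sign flip for alpha and beta,
+* and the kernel variant is chosen according to whether alpha and beta reside on the host or on the device.
+*/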
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT2,
+         typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                              && viennacl::is_any_scalar<NumericT3>::value
+                            >::type
+asbs(ScalarT1 & s1,
+     ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+     ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+  assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  std::string kernel_name;
+  bool is_cpu_2 = viennacl::is_cpu_scalar<NumericT2>::value;
+  bool is_cpu_3 = viennacl::is_cpu_scalar<NumericT3>::value;
+  if (is_cpu_2 && is_cpu_3)
+    kernel_name = "asbs_cpu_cpu";
+  else if (is_cpu_2 && !is_cpu_3)
+    kernel_name = "asbs_cpu_gpu";
+  else if (!is_cpu_2 && is_cpu_3)
+    kernel_name = "asbs_gpu_cpu";
+  else
+    kernel_name = "asbs_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(s3) )
+                        );
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT2,
+         typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                              && viennacl::is_any_scalar<NumericT3>::value
+                            >::type
+asbs_s(ScalarT1 & s1,
+       ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+       ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+  assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  std::string kernel_name;
+  if (viennacl::is_cpu_scalar<NumericT2>::value && viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_cpu_cpu";
+  else if (viennacl::is_cpu_scalar<NumericT2>::value && !viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<NumericT2>::value && viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_gpu_cpu";
+  else
+    kernel_name = "asbs_s_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2),
+                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(s3) )
+                        );
+}
+
+
+/** @brief Swaps the contents of two scalars (the data is copied)
+*
+* @param s1   The first scalar
+* @param s2   The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                            >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), "swap");
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           viennacl::traits::opencl_handle(s2))
+                        );
+}
+
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


[31/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..51d99e1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
@@ -0,0 +1,2809 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
+ #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
+//#else
+// #include "viennacl/linalg/cuda/spgemm.hpp"
+//#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+
+  template<typename NumericT>
+  __global__ void csr_row_info_extractor_kernel(
+            const unsigned int * row_indices,
+            const unsigned int * column_indices,
+            const NumericT * elements,
+            NumericT * result,
+            unsigned int size,
+            unsigned int option)
+  {
+    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                      row  < size;
+                      row += gridDim.x * blockDim.x)
+    {
+      NumericT value = 0;
+      unsigned int row_end = row_indices[row+1];
+
+      switch (option)
+      {
+        case 0: //inf-norm
+          for (unsigned int i = row_indices[row]; i < row_end; ++i)
+            value = max(value, fabs(elements[i]));
+          break;
+
+        case 1: //1-norm
+          for (unsigned int i = row_indices[row]; i < row_end; ++i)
+            value += fabs(elements[i]);
+          break;
+
+        case 2: //2-norm
+          for (unsigned int i = row_indices[row]; i < row_end; ++i)
+            value += elements[i] * elements[i];
+          value = sqrt(value);
+          break;
+
+        case 3: //diagonal entry
+          for (unsigned int i = row_indices[row]; i < row_end; ++i)
+          {
+            if (column_indices[i] == row)
+            {
+              value = elements[i];
+              break;
+            }
+          }
+          break;
+
+        default:
+          break;
+      }
+      result[row] = value;
+    }
+  }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(mat.size1()),
+                                                static_cast<unsigned int>(info_selector)
+                                               );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
+  }
+
+  struct spmv_pure
+  {
+    template<typename NumericT>
+    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = Ax; }
+  };
+
+  struct spmv_alpha_beta
+  {
+    template<typename NumericT>
+    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = alpha * Ax + ((beta != 0) ? beta * result : 0); }
+  };
+
+} //namespace detail
+
+
+
+template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  __shared__ NumericT shared_elements[512];
+
+  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
+  const unsigned int block_start = blockIdx.x * block_increment;
+  const unsigned int block_stop  = min(block_start + block_increment, size_result);
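+
+  // Each group of SubWarpSizeV consecutive threads cooperates on one matrix row: every thread accumulates
+  // a strided partial dot product over the row's nonzeros, the partial sums are combined through an
+  // XOR-pattern butterfly reduction in shared memory, and the first thread of each group writes the
+  // (alpha/beta-handled) result.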
+
+  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
+                    row  < block_stop;
+                    row += blockDim.x / SubWarpSizeV)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+    shared_elements[threadIdx.x] = dot_prod;
+    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
+    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
+    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
+    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
+    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+    if (id_in_row == 0)
+      AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
+  }
+}
+
+
+template<typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_adaptive_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const unsigned int * row_blocks,
+          const NumericT * elements,
+          unsigned int num_blocks,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  __shared__ NumericT     shared_elements[1024];
+
+  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+  {
+    unsigned int row_start = row_blocks[block_id];
+    unsigned int row_stop  = row_blocks[block_id + 1];
+    unsigned int element_start = row_indices[row_start];
+    unsigned int element_stop = row_indices[row_stop];
+    unsigned int rows_to_process = row_stop - row_start;
+
+    if (rows_to_process > 1)  // CSR stream with one thread per row
+    {
+      // load to shared buffer:
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
+
+      __syncthreads();
+
+      // use one thread per row to sum:
+      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+      {
+        NumericT dot_prod = 0;
+        unsigned int thread_row_start = row_indices[row]     - element_start;
+        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
+        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+          dot_prod += shared_elements[i];
+        AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
+      }
+    }
+    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
+    else // CSR vector for a single row
+    {
+      // load and sum to shared buffer:
+      shared_elements[threadIdx.x] = 0;
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+      // reduction to obtain final result
+      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+      {
+        __syncthreads();
+        if (threadIdx.x < stride)
+          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+      }
+
+      if (threadIdx.x == 0)
+        AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
+    }
+
+    __syncthreads();  // avoid race conditions
+  }
+}
+
+
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the operation result = alpha * prod(mat, vec) + beta * result
+* (the convenience expression result = prod(mat, vec) corresponds to alpha = 1, beta = 0).
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous contents of the result vector
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  static bool first = true;
+  static bool is_maxwell = false;
+
+  // check whether the CUDA device is from the Maxwell family.
+  // Only run once, because the query to the backend takes about as long as a kernel launch (~15us), which makes querying it on every call too expensive.
+  //
+  // Note: This might result in non-optimal kernels being selected if both Maxwell and non-Maxwell GPUs are available in the system and devices are switched at runtime.
+  //       However, this situation is rare, hence the benefits of this singleton outweigh the disadvantages in such a corner case.
+  if (first)
+  {
+    cudaDeviceProp prop;
+    int device_index = 0;
+
+    cudaError_t err_flag = cudaGetDevice(&device_index);
+    if (err_flag == cudaSuccess)
+    {
+      err_flag = cudaGetDeviceProperties(&prop, device_index);
+      if (err_flag == cudaSuccess && prop.major >= 5)
+        is_maxwell = true;
+    }
+    first = false;
+  }
+
+  if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4) // less than 10% of threads expected to idle
+  {
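+    // The condition below is equivalent to (alpha != 1 || beta != 0) for non-NaN values (written with
+    // inequalities to avoid exact floating-point comparisons); only then is the alpha/beta-scaled kernel needed.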
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+                                                                    viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                                    viennacl::cuda_arg(vec),
+                                                                    static_cast<unsigned int>(vec.start()),
+                                                                    static_cast<unsigned int>(vec.stride()),
+                                                                    alpha,
+                                                                    viennacl::cuda_arg(result),
+                                                                    static_cast<unsigned int>(result.start()),
+                                                                    static_cast<unsigned int>(result.stride()),
+                                                                    static_cast<unsigned int>(result.size()),
+                                                                    beta
+                                                                   );
+    else
+      compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+                                                                    viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                                    viennacl::cuda_arg(vec),
+                                                                    static_cast<unsigned int>(vec.start()),
+                                                                    static_cast<unsigned int>(vec.stride()),
+                                                                    alpha,
+                                                                    viennacl::cuda_arg(result),
+                                                                    static_cast<unsigned int>(result.start()),
+                                                                    static_cast<unsigned int>(result.stride()),
+                                                                    static_cast<unsigned int>(result.size()),
+                                                                    beta
+                                                                   );
+     VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+  }
+  else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0) // less than 25% of threads expected to idle
+  {
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+                                                                   viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                                   viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                                   viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                                   viennacl::cuda_arg(vec),
+                                                                   static_cast<unsigned int>(vec.start()),
+                                                                   static_cast<unsigned int>(vec.stride()),
+                                                                   alpha,
+                                                                   viennacl::cuda_arg(result),
+                                                                   static_cast<unsigned int>(result.start()),
+                                                                   static_cast<unsigned int>(result.stride()),
+                                                                   static_cast<unsigned int>(result.size()),
+                                                                   beta
+                                                                  );
+    else
+      compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+                                                                   viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                                   viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                                   viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                                   viennacl::cuda_arg(vec),
+                                                                   static_cast<unsigned int>(vec.start()),
+                                                                   static_cast<unsigned int>(vec.stride()),
+                                                                   alpha,
+                                                                   viennacl::cuda_arg(result),
+                                                                   static_cast<unsigned int>(result.start()),
+                                                                   static_cast<unsigned int>(result.stride()),
+                                                                   static_cast<unsigned int>(result.size()),
+                                                                   beta
+                                                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+  }
+  else
+  {
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                              viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                              viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                              viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                              static_cast<unsigned int>(mat.blocks1()),
+                                                              viennacl::cuda_arg(vec),
+                                                              static_cast<unsigned int>(vec.start()),
+                                                              static_cast<unsigned int>(vec.stride()),
+                                                              alpha,
+                                                              viennacl::cuda_arg(result),
+                                                              static_cast<unsigned int>(result.start()),
+                                                              static_cast<unsigned int>(result.stride()),
+                                                              static_cast<unsigned int>(result.size()),
+                                                              beta
+                                                             );
+    else
+      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                              viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                              viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                              viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                              static_cast<unsigned int>(mat.blocks1()),
+                                                              viennacl::cuda_arg(vec),
+                                                              static_cast<unsigned int>(vec.start()),
+                                                              static_cast<unsigned int>(vec.stride()),
+                                                              alpha,
+                                                              viennacl::cuda_arg(result),
+                                                              static_cast<unsigned int>(result.start()),
+                                                              static_cast<unsigned int>(result.stride()),
+                                                              static_cast<unsigned int>(result.size()),
+                                                              beta
+                                                             );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
+  }
+}
+
+/** @brief Helper struct for accessing an element of a row- or column-major matrix.
+  *
+  * @tparam LayoutT   The layout tag: either row_major or column_major
+  */
+template<typename LayoutT>
+struct mat_mult_matrix_index
+{
+  static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                unsigned int row_start, unsigned int row_inc,
+                                unsigned int col_start, unsigned int col_inc,
+                                unsigned int internal_rows, unsigned int internal_cols)
+  {
+    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
+  }
+};
+
+/** \cond */
+template<>
+struct mat_mult_matrix_index<viennacl::column_major>
+{
+  static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                unsigned int row_start, unsigned int row_inc,
+                                unsigned int col_start, unsigned int col_inc,
+                                unsigned int internal_rows, unsigned int internal_cols)
+  {
+    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
+  }
+};
+/** \endcond */
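+
+// Example: with row_start = col_start = 0 and row_inc = col_inc = 1, element (i, j) = (2, 3) of a matrix
+// with internal_cols = 8 maps to buffer index 2 * 8 + 3 = 19 in the row-major case, and to
+// 2 + 3 * internal_rows in the column-major case.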
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_mat_mul_kernel(
+          const unsigned int * sp_mat_row_indices,
+          const unsigned int * sp_mat_col_indices,
+          const NumericT * sp_mat_elements,
+          const NumericT * d_mat,
+          unsigned int d_mat_row_start,
+          unsigned int d_mat_col_start,
+          unsigned int d_mat_row_inc,
+          unsigned int d_mat_col_inc,
+          unsigned int d_mat_row_size,
+          unsigned int d_mat_col_size,
+          unsigned int d_mat_internal_rows,
+          unsigned int d_mat_internal_cols,
+          NumericT * result,
+          unsigned int result_row_start,
+          unsigned int result_col_start,
+          unsigned int result_row_inc,
+          unsigned int result_col_inc,
+          unsigned int result_row_size,
+          unsigned int result_col_size,
+          unsigned int result_internal_rows,
+          unsigned int result_internal_cols)
+{
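+  // Blocks stride over the rows of the sparse factor; within a block, threads stride over the columns of
+  // the dense factor, each accumulating the dot product of the sparse row with the corresponding dense
+  // column (indices mapped through DMatIndexT / ResultIndexT to honor the row-/column-major layouts).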
+  for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x)
+  {
+    unsigned int row_start = sp_mat_row_indices[row];
+    unsigned int row_end = sp_mat_row_indices[row+1];
+
+    for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+    {
+      NumericT r = 0;
+
+      for (unsigned int k = row_start; k < row_end; k++)
+      {
+        unsigned int j = sp_mat_col_indices[k];
+        NumericT x = sp_mat_elements[k];
+        NumericT y = d_mat[ DMatIndexT::apply(j, col,
+                                              d_mat_row_start, d_mat_row_inc,
+                                              d_mat_col_start, d_mat_col_inc,
+                                              d_mat_internal_rows, d_mat_internal_cols) ];
+
+        r += x * y;
+      }
+
+      result[ResultIndexT::apply(row, col,
+                                 result_row_start, result_row_inc,
+                                 result_col_start, result_col_inc,
+                                 result_internal_rows, result_internal_cols)] = r;
+    }
+  }
+}
+
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the first factor is a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat   The sparse (CSR) matrix
+* @param d_mat    The dense matrix
+* @param result   The dense result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result)
+{
+  if (d_mat.row_major() && result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else if (d_mat.row_major() && !result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else if (!d_mat.row_major() && result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+}
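
A minimal usage sketch for the dispatcher above (sizes, the float element type and the fill step are
placeholders; assumes a build with VIENNACL_WITH_CUDA so that this backend is selected):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    int main()
    {
      viennacl::compressed_matrix<float> A(1000, 500);   // CSR matrix, assumed filled elsewhere
      viennacl::matrix<float>            B(500, 64);     // dense factor
      viennacl::matrix<float>            C(1000, 64);    // dense result
      C = viennacl::linalg::prod(A, B);                  // ends up in the prod_impl above
      return 0;
    }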
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_tr_mat_mul_kernel(
+          const unsigned int * sp_mat_row_indices,
+          const unsigned int * sp_mat_col_indices,
+          const NumericT * sp_mat_elements,
+          const NumericT * d_mat,
+          unsigned int d_mat_row_start,
+          unsigned int d_mat_col_start,
+          unsigned int d_mat_row_inc,
+          unsigned int d_mat_col_inc,
+          unsigned int d_mat_row_size,
+          unsigned int d_mat_col_size,
+          unsigned int d_mat_internal_rows,
+          unsigned int d_mat_internal_cols,
+          NumericT * result,
+          unsigned int result_row_start,
+          unsigned int result_col_start,
+          unsigned int result_row_inc,
+          unsigned int result_col_inc,
+          unsigned int result_row_size,
+          unsigned int result_col_size,
+          unsigned int result_internal_rows,
+          unsigned int result_internal_cols)
+{
+  for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x)
+  {
+    unsigned int row_start = sp_mat_row_indices[row];
+    unsigned int row_end = sp_mat_row_indices[row+1];
+
+    for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+    {
+      NumericT r = 0;
+
+      for (unsigned int k = row_start; k < row_end; k++)
+      {
+        unsigned int j = sp_mat_col_indices[k];
+        NumericT x = sp_mat_elements[k];
+        NumericT y = d_mat[ DMatIndexT::apply(col, j,
+                                              d_mat_row_start, d_mat_row_inc,
+                                              d_mat_col_start, d_mat_col_inc,
+                                              d_mat_internal_rows, d_mat_internal_cols) ];
+
+        r += x * y;
+      }
+
+      result [ ResultIndexT::apply(row, col,
+                                   result_row_start, result_row_inc,
+                                   result_col_start, result_col_inc,
+                                   result_internal_rows, result_internal_cols) ] = r;
+    }
+  }
+
+}
+
+/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, the first matrix
+*          being compressed (CSR) and the second a transposed dense matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat             The sparse matrix
+* @param d_mat              The transposed dense matrix proxy
+* @param result             The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+                viennacl::matrix_base<NumericT> & result)
+{
+
+  if (d_mat.lhs().row_major() && result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+                                                (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                 viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                 viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                 viennacl::cuda_arg(d_mat.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                 viennacl::cuda_arg(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else if (d_mat.lhs().row_major() && !result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+                                                (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                 viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                 viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                 viennacl::cuda_arg(d_mat.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                 viennacl::cuda_arg(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else if (!d_mat.lhs().row_major() && result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+                                                (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                 viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                 viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                 viennacl::cuda_arg(d_mat.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                 viennacl::cuda_arg(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+                                                (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+                                                 viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+                                                 viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+                                                 viennacl::cuda_arg(d_mat.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                 viennacl::cuda_arg(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+}
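
The transposed variant only changes the right-hand side; continuing the previous sketch (again a
hypothetical fragment, not part of the patch):

    viennacl::matrix<float> B2(64, 500);                  // note the swapped dimensions
    C = viennacl::linalg::prod(A, viennacl::trans(B2));   // reaches the prod_impl above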
+
+
+//
+// triangular solves for compressed_matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_matrix_diagonal_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          NumericT * result,
+          unsigned int size)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < size;
+                    row += gridDim.x * blockDim.x)
+  {
+    NumericT diag = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row]; i < row_end; ++i)
+    {
+      unsigned int col_index = column_indices[i];
+      if (col_index == row)
+      {
+        diag = elements[i];
+        break;
+      }
+    }
+    result[row] = diag;
+  }
+}
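
The kernel above scans each CSR row for the entry whose column index equals the row index. A
host-side reference of the same logic, for illustration only (function name and types are
placeholders):

    #include <cstddef>
    #include <vector>

    // Returns the main diagonal of a CSR matrix given by its three arrays.
    std::vector<float> csr_diagonal(std::vector<unsigned int> const & row_indices,
                                    std::vector<unsigned int> const & column_indices,
                                    std::vector<float>        const & elements)
    {
      std::size_t const num_rows = row_indices.empty() ? 0 : row_indices.size() - 1;
      std::vector<float> diag(num_rows, 0.0f);
      for (std::size_t row = 0; row < num_rows; ++row)
        for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
          if (column_indices[i] == row) { diag[row] = elements[i]; break; }
      return diag;
    }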
+
+
+/** @brief Carries out an inplace solve with a unit lower triangular system matrix
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_lower_tag)
+{
+  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                         viennacl::cuda_arg<NumericT>(mat.handle()),
+                                         viennacl::cuda_arg(vec),
+                                         static_cast<unsigned int>(mat.size1())
+                                        );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out an inplace solve with a lower triangular system matrix
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::lower_tag)
+{
+  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                    viennacl::cuda_arg(vec),
+                                    static_cast<unsigned int>(mat.size1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
+}
+
+
+
+/** @brief Carries out an inplace solve with a unit upper triangular system matrix
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_upper_tag)
+{
+  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                    viennacl::cuda_arg(vec),
+                                    static_cast<unsigned int>(mat.size1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out an inplace solve with an upper triangular system matrix
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::upper_tag)
+{
+  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                    viennacl::cuda_arg(vec),
+                                    static_cast<unsigned int>(mat.size1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
+}
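
These four overloads back viennacl::linalg::inplace_solve() for CSR matrices; the tag picks the
triangle. A sketch of the calling side (fragment only; the usual ViennaCL headers plus
viennacl/linalg/direct_solve.hpp are assumed, and the data is hypothetical):

    viennacl::compressed_matrix<float> L(1000, 1000);   // lower triangular CSR matrix, filled elsewhere
    viennacl::vector<float>            rhs(1000);       // right-hand side, overwritten by the solution
    viennacl::linalg::inplace_solve(L, rhs, viennacl::linalg::lower_tag());
    // unit_lower_tag(), upper_tag() and unit_upper_tag() select the other three overloads

Note the <<<1, 128>>> launches: the row-by-row dependency of a triangular solve keeps the work on a
single block.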
+
+
+
+// transposed
+
+/** @brief Carries out an inplace solve with a transposed system matrix, taken as unit lower triangular
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_lower_tag)
+{
+  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                          viennacl::cuda_arg(vec),
+                                          static_cast<unsigned int>(mat.lhs().size1())
+                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out an inplace solve with a transposed system matrix, taken as lower triangular
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::lower_tag)
+{
+  viennacl::vector<NumericT> diagonal(vec.size());
+
+  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(diagonal),
+                                                static_cast<unsigned int>(mat.size1())
+                                               );
+
+  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                          viennacl::cuda_arg(diagonal),
+                                          viennacl::cuda_arg(vec),
+                                          static_cast<unsigned int>(mat.lhs().size1())
+                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
+}
+
+
+/** @brief Carries out an inplace solve with a transposed system matrix, taken as unit upper triangular
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_upper_tag)
+{
+  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(mat.lhs().size1())
+                                              );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out an inplace solve with a transposed system matrix, taken as upper triangular
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::upper_tag)
+{
+  viennacl::vector<NumericT> diagonal(vec.size());
+
+  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(diagonal),
+                                                static_cast<unsigned int>(mat.size1())
+                                               );
+
+  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                           viennacl::cuda_arg(diagonal),
+                                           viennacl::cuda_arg(vec),
+                                           static_cast<unsigned int>(mat.lhs().size1())
+                                          );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
+}
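
For the transposed overloads the system matrix is the op_trans expression shown in the signatures; the
non-unit variants additionally extract the diagonal first (see the compressed_matrix_diagonal_kernel
launches above). A hypothetical fragment, building the expression directly from the types in the
signature (L and rhs as in the previous sketch):

    viennacl::matrix_expression<const viennacl::compressed_matrix<float>,
                                const viennacl::compressed_matrix<float>,
                                viennacl::op_trans> trans_L(L, L);
    viennacl::linalg::inplace_solve(trans_L, rhs, viennacl::linalg::upper_tag());   // solves L^T x = rhs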
+
+namespace detail
+{
+  //
+  // block solves
+  //
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & L,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::unit_lower_tag)
+  {
+    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
+                                                         viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
+                                                         viennacl::cuda_arg<NumericT>(L.lhs().handle()),
+                                                         viennacl::cuda_arg<unsigned int>(block_indices),
+                                                         viennacl::cuda_arg(vec),
+                                                         static_cast<unsigned int>(L.lhs().size1())
+                                                        );
+  }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & U,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & U_diagonal,
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::upper_tag)
+  {
+    csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
+                                                     viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
+                                                     viennacl::cuda_arg<NumericT>(U.lhs().handle()),
+                                                     viennacl::cuda_arg(U_diagonal),
+                                                     viennacl::cuda_arg<unsigned int>(block_indices),
+                                                     viennacl::cuda_arg(vec),
+                                                     static_cast<unsigned int>(U.lhs().size1())
+                                                    );
+  }
+
+
+}
+
+
+//
+// Compressed Compressed Matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_compressed_matrix_vec_mul_kernel(
+          const unsigned int * row_jumper,
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          unsigned int nonzero_rows,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
+                    i  < nonzero_rows;
+                    i += gridDim.x * blockDim.x)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_jumper[i+1];
+    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
+      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
+
+    unsigned int index = row_indices[i] * inc_result + start_result;
+    if (beta != 0) result[index] += alpha * dot_prod;
+    else           result[index]  = alpha * dot_prod;
+  }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec),
+* generalized here to result = alpha * prod(mat, vec) + beta * result.
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous content of result
+*/
+template<typename NumericT>
+void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  if (beta < 0 || beta > 0)
+    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+  else
+    result.clear();
+
+  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                            static_cast<unsigned int>(mat.nnz1()),
+                                                            viennacl::cuda_arg(vec),
+                                                            static_cast<unsigned int>(vec.start()),
+                                                            static_cast<unsigned int>(vec.stride()),
+                                                            alpha,
+                                                            viennacl::cuda_arg(result),
+                                                            static_cast<unsigned int>(result.start()),
+                                                            static_cast<unsigned int>(result.stride()),
+                                                            static_cast<unsigned int>(result.size()),
+                                                            beta
+                                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
+}
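
A compressed_compressed_matrix stores only its non-empty rows (handle3() holds their row indices), so
the kernel touches one result entry per stored row. Usage looks the same as for compressed_matrix; a
hypothetical fragment (the high-level prod() presumably reaches this backend with alpha = 1 and
beta = 0 for a plain assignment):

    viennacl::compressed_compressed_matrix<float> A;   // mostly-empty-row matrix, filled elsewhere
    viennacl::vector<float> x(10000), y(10000);
    y = viennacl::linalg::prod(A, x);                   // y = A * x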
+
+//
+// Coordinate Matrix
+//
+
+
+namespace detail
+{
+
+  template<typename NumericT>
+  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
+                                          const NumericT * elements,
+                                          const unsigned int * group_boundaries,
+                                          NumericT * result,
+                                          unsigned int option)
+  {
+    __shared__ unsigned int shared_rows[128];
+    __shared__ NumericT inter_results[128];
+
+    uint2 tmp;
+    NumericT val;
+    unsigned int last_index  = blockDim.x - 1;
+    unsigned int group_start = group_boundaries[blockIdx.x];
+    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+    unsigned int local_index = 0;
+
+    for (unsigned int k = 0; k < k_end; ++k)
+    {
+      local_index = group_start + k * blockDim.x + threadIdx.x;
+
+      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
+
+      //check for carry from previous loop run:
+      if (threadIdx.x == 0 && k > 0)
+      {
+        if (tmp.x == shared_rows[last_index])
+        {
+          switch (option)
+          {
+            case 0: //inf-norm
+            case 3: //diagonal entry
+              val = max(val, fabs(inter_results[last_index]));
+              break;
+
+            case 1: //1-norm
+              val = fabs(val) + inter_results[last_index];
+              break;
+
+            case 2: //2-norm
+              val = sqrt(val * val + inter_results[last_index]);
+              break;
+
+            default:
+              break;
+          }
+        }
+        else
+        {
+          switch (option)
+          {
+            case 0: //inf-norm
+            case 1: //1-norm
+            case 3: //diagonal entry
+              result[shared_rows[last_index]] = inter_results[last_index];
+              break;
+
+            case 2: //2-norm
+              result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
+            default:
+              break;
+          }
+        }
+      }
+
+      //segmented parallel reduction begin
+      __syncthreads();
+      shared_rows[threadIdx.x] = tmp.x;
+      switch (option)
+      {
+        case 0:
+        case 3:
+          inter_results[threadIdx.x] = val;
+          break;
+        case 1:
+          inter_results[threadIdx.x] = fabs(val);
+          break;
+        case 2:
+          inter_results[threadIdx.x] = val * val;
+        default:
+          break;
+      }
+      __syncthreads();
+
+      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+      {
+        NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+        __syncthreads();
+        switch (option)
+        {
+          case 0: //inf-norm
+          case 3: //diagonal entry
+            inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
+            break;
+
+          case 1: //1-norm
+            inter_results[threadIdx.x] += left;
+            break;
+
+          case 2: //2-norm
+            inter_results[threadIdx.x] += left;
+            break;
+
+          default:
+            break;
+        }
+        __syncthreads();
+      }
+      //segmented parallel reduction end
+
+      if (threadIdx.x != last_index &&
+          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
+          inter_results[threadIdx.x] != 0)
+      {
+        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+      }
+
+      __syncthreads();
+    } //for k
+
+    if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
+      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+  }
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+                                         viennacl::cuda_arg<NumericT>(mat.handle()),
+                                         viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                         viennacl::cuda_arg(vec),
+                                         static_cast<unsigned int>(info_selector)
+                                        );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
+  }
+
+} //namespace detail
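
The option codes used by coo_row_info_extractor map to per-row quantities: 0 = inf-norm, 1 = 1-norm,
2 = 2-norm, 3 = diagonal entry. A host-side reference for the 1-norm case, for illustration only
(names are placeholders):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // 1-norm of each row of a COO matrix given as parallel (row index, value) arrays.
    std::vector<float> coo_row_norm1(std::vector<unsigned int> const & rows,
                                     std::vector<float>        const & vals,
                                     std::size_t num_rows)
    {
      std::vector<float> result(num_rows, 0.0f);
      for (std::size_t i = 0; i < vals.size(); ++i)
        result[rows[i]] += std::fabs(vals[i]);
      return result;
    }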
+
+
+template<typename NumericT>
+__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                 const NumericT * elements,
+                                                 const unsigned int * group_boundaries,
+                                                 const NumericT * x,
+                                                 unsigned int start_x,
+                                                 unsigned int inc_x,
+                                                 NumericT alpha,
+                                                       NumericT * result,
+                                                 unsigned int start_result,
+                                                 unsigned int inc_result,
+                                                 NumericT beta)
+{
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int k = 0; k < k_end; ++k)
+  {
+    local_index = group_start + k * blockDim.x + threadIdx.x;
+
+    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
+
+    //check for carry from previous loop run:
+    if (threadIdx.x == 0 && k > 0)
+    {
+      if (tmp.x == shared_rows[blockDim.x-1])
+        val += inter_results[blockDim.x-1];
+      else if (beta != 0)
+        result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
+      else
+        result[shared_rows[blockDim.x-1] * inc_result + start_result]  = alpha * inter_results[blockDim.x-1];
+    }
+
+    //segmented parallel reduction begin
+    __syncthreads();
+    shared_rows[threadIdx.x] = tmp.x;
+    inter_results[threadIdx.x] = val;
+    NumericT left = 0;
+    __syncthreads();
+
+    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+    {
+      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+      __syncthreads();
+      inter_results[threadIdx.x] += left;
+      __syncthreads();
+    }
+    //segmented parallel reduction end
+
+    if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
+        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+    {
+      if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+      else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
+    }
+
+    __syncthreads();
+  } //for k
+
+  if (local_index + 1 == group_end) {
+    if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+    else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
+  }
+}
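
The segmented reduction above adds up only those partial products that share a row index within one
work group; a scalar illustration of the result it computes (self-contained toy example, not part of
the patch):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
      // (row, value) pairs as they appear in the COO groups, already sorted by row:
      unsigned int rows[] = {0, 0, 1, 1, 1, 2};
      float        vals[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      std::vector<float> result(3, 0.f);
      for (std::size_t i = 0; i < 6; ++i)
        result[rows[i]] += vals[i];                      // per-row sums: 3, 12, 6
      std::printf("%g %g %g\n", result[0], result[1], result[2]);
      return 0;
    }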
+
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec),
+* generalized here to result = alpha * prod(mat, vec) + beta * result.
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for the matrix-vector product
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous content of result
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+                     viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  if (beta < 0 || beta > 0)
+    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+  else
+    result.clear();
+
+  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                alpha,
+                                                viennacl::cuda_arg(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                beta
+                                               );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
+}
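
Usage mirrors the CSR case, only the storage type changes; a hypothetical fragment (the matrix is
typically filled via viennacl::copy() from host data):

    viennacl::coordinate_matrix<float> A(1000, 1000);   // COO storage
    viennacl::vector<float> x(1000), y(1000);
    y = viennacl::linalg::prod(A, x);                    // reaches the prod_impl above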
+
+
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                   const NumericT * elements,
+                                                   const unsigned int * group_boundaries,
+                                                   const NumericT * d_mat,
+                                                   unsigned int d_mat_row_start,
+                                                   unsigned int d_mat_col_start,
+                                                   unsigned int d_mat_row_inc,
+                                                   unsigned int d_mat_col_inc,
+                                                   unsigned int d_mat_row_size,
+                                                   unsigned int d_mat_col_size,
+                                                   unsigned int d_mat_internal_rows,
+                                                   unsigned int d_mat_internal_cols,
+                                                   NumericT * result,
+                                                   unsigned int result_row_start,
+                                                   unsigned int result_col_start,
+                                                   unsigned int result_row_inc,
+                                                   unsigned int result_col_inc,
+                                                   unsigned int result_row_size,
+                                                   unsigned int result_col_size,
+                                                   unsigned int result_internal_rows,
+                                                   unsigned int result_internal_cols)
+{
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+  {
+    for (unsigned int k = 0; k < k_end; ++k)
+    {
+      local_index = group_start + k * blockDim.x + threadIdx.x;
+
+      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
+                                                                                        d_mat_row_start, d_mat_row_inc,
+                                                                                        d_mat_col_start, d_mat_col_inc,
+                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;
+
+      //check for carry from previous loop run:
+      if (threadIdx.x == 0 && k > 0)
+      {
+        if (tmp.x == shared_rows[blockDim.x-1])
+          val += inter_results[blockDim.x-1];
+        else
+          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
+                                     result_row_start, result_row_inc,
+                                     result_col_start, result_col_inc,
+                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
+      }
+
+      //segmented parallel reduction begin
+      __syncthreads();
+      shared_rows[threadIdx.x] = tmp.x;
+      inter_results[threadIdx.x] = val;
+      NumericT left = 0;
+      __syncthreads();
+
+      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+      {
+        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+        __syncthreads();
+        inter_results[threadIdx.x] += left;
+        __syncthreads();
+      }
+      //segmented parallel reduction end
+
+      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+      {
+        result[ResultIndexT::apply(tmp.x, result_col,
+                                   result_row_start, result_row_inc,
+                                   result_col_start, result_col_inc,
+                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+      }
+
+      __syncthreads();
+    } //for k
+
+    if (local_index + 1 == group_end)
+      result[ResultIndexT::apply(tmp.x, result_col,
+                                 result_row_start, result_row_inc,
+                                 result_col_start, result_col_inc,
+                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+  }
+}
+
+
+/** @brief Carries out sparse-matrix (COO format) times dense-matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat     The sparse matrix (coordinate format)
+* @param d_mat      The dense matrix
+* @param result     The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+                     viennacl::matrix_base<NumericT> & result)
+{
+  if (d_mat.row_major() && result.row_major())
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                   );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
+  else if (d_mat.row_major() && !result.row_major())
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                   );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
+  else if (!d_mat.row_major() && result.row_major())
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                   );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
+  else
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
+                                                  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+                                                   viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+                                                   viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+                                                   viennacl::cuda_arg(d_mat),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                   viennacl::cuda_arg(result),
+                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                   );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
+
+}
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                     const NumericT * elements,
+                                                     const unsigned int * group_boundaries,
+                                                     const NumericT * d_mat,
+                                                     unsigned int d_mat_row_start,
+                                                     unsigned int d_mat_col_start,
+                                                     unsigned int d_mat_row_inc,
+                                                     unsigned int d_mat_col_inc,
+                                                     unsigned int d_mat_row_size,
+                                                     unsigned int d_mat_col_size,
+                                                     unsigned int d_mat_internal_rows,
+                                                     unsigned int d_mat_internal_cols,
+                                                     NumericT * result,
+                                                     unsigned int result_row_start,
+                                                     unsigned int result_col_start,
+                                                     unsigned int result_row_inc,
+                                                     unsigned int result_col_inc,
+                                                     unsigned int result_row_size,
+                                                     unsigned int result_col_size,
+                                                     unsigned int result_internal_rows,
+                                                     unsigned int result_internal_cols)
+{
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  uns

<TRUNCATED>

[21/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
new file mode 100644
index 0000000..ee6626c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
@@ -0,0 +1,880 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/iterative_operations.hpp
+    @brief Implementations of specialized kernels for fast iterative solvers using OpenMP on the CPU
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+  /** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
+    *
+    * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+    *   Ap = prod(A, p);
+    * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+    */
+  template<typename NumericT>
+  void pipelined_prod_impl(compressed_matrix<NumericT> const & A,
+                           vector_base<NumericT> const & p,
+                           vector_base<NumericT> & Ap,
+                           NumericT const * r0star,
+                           vector_base<NumericT> & inner_prod_buffer,
+                           vcl_size_t buffer_chunk_size,
+                           vcl_size_t buffer_chunk_offset)
+  {
+    typedef NumericT        value_type;
+
+    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+    value_type   const *  p_buf      = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+    value_type   const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
+    unsigned int const *  row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+    unsigned int const *  col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+    value_type inner_prod_ApAp = 0;
+    value_type inner_prod_pAp = 0;
+    value_type inner_prod_Ap_r0star = 0;
+    for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+    {
+      value_type dot_prod = 0;
+      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+
+      vcl_size_t row_end = row_buffer[row+1];
+      for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
+        dot_prod += elements[i] * p_buf[col_buffer[i]];
+
+      // update contributions for the inner products (Ap, Ap) and (p, Ap)
+      Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
+      inner_prod_ApAp += dot_prod * dot_prod;
+      inner_prod_pAp  += val_p_diag * dot_prod;
+      inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
+    }
+
+    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
+    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+    if (r0star)
+      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+  }
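All pipelined_prod_impl overloads in this file share the same buffer convention: partial results go into disjoint chunks of inner_prod_buffer (the chunk starting at index buffer_chunk_size collects <Ap,Ap>, the chunk at 2*buffer_chunk_size collects <p,Ap>, and the chunk at buffer_chunk_offset collects <Ap,r0*>), and the calling solver obtains the final scalars by summing each chunk. A minimal host-side sketch of that summation, assuming a plain array and a hypothetical helper name (not ViennaCL API):

#include <cstddef>

// Hypothetical helper: sum one chunk of a partial-reduction buffer.
// With the three-chunk layout used by the CG routines below, chunk 0
// accumulates <r,r>, chunk 1 <Ap,Ap> and chunk 2 <p,Ap>; unused entries
// of a chunk are assumed to be zero.
template<typename NumericT>
NumericT sum_chunk(NumericT const * buffer, std::size_t chunk_size, std::size_t chunk_index)
{
  NumericT result = 0;
  for (std::size_t i = 0; i < chunk_size; ++i)
    result += buffer[chunk_index * chunk_size + i];
  return result;
}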
+
+
+
+  /** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
+    *
+    * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+    *   Ap = prod(A, p);
+    * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+    */
+  template<typename NumericT>
+  void pipelined_prod_impl(coordinate_matrix<NumericT> const & A,
+                           vector_base<NumericT> const & p,
+                           vector_base<NumericT> & Ap,
+                           NumericT const * r0star,
+                           vector_base<NumericT> & inner_prod_buffer,
+                           vcl_size_t buffer_chunk_size,
+                           vcl_size_t buffer_chunk_offset)
+  {
+    typedef NumericT        value_type;
+
+    value_type         * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+    value_type   const *  p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+    value_type   const * elements     = detail::extract_raw_pointer<value_type>(A.handle());
+    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
+    value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+    // flush result buffer (cannot be expected to be zero)
+    for (vcl_size_t i = 0; i< Ap.size(); ++i)
+      Ap_buf[i] = 0;
+
+    // matrix-vector product with a general COO format
+    for (vcl_size_t i = 0; i < A.nnz(); ++i)
+      Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
+
+    // computing the inner products (Ap, Ap) and (p, Ap):
+    // Note: The COO format does not allow injecting the subsequent operations into the matrix-vector product, because its row and column ordering guarantees are too weak
+    value_type inner_prod_ApAp = 0;
+    value_type inner_prod_pAp = 0;
+    value_type inner_prod_Ap_r0star = 0;
+    for (vcl_size_t i = 0; i<Ap.size(); ++i)
+    {
+      NumericT value_Ap = Ap_buf[i];
+      NumericT value_p  =  p_buf[i];
+
+      inner_prod_ApAp += value_Ap * value_Ap;
+      inner_prod_pAp  += value_Ap * value_p;
+      inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
+    }
+
+    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
+    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+    if (r0star)
+      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+  }
+
+
+  /** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
+    *
+    * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+    *   Ap = prod(A, p);
+    * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+    */
+  template<typename NumericT>
+  void pipelined_prod_impl(ell_matrix<NumericT> const & A,
+                           vector_base<NumericT> const & p,
+                           vector_base<NumericT> & Ap,
+                           NumericT const * r0star,
+                           vector_base<NumericT> & inner_prod_buffer,
+                           vcl_size_t buffer_chunk_size,
+                           vcl_size_t buffer_chunk_offset)
+  {
+    typedef NumericT     value_type;
+
+    value_type         * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+    value_type   const *  p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+    value_type   const * elements     = detail::extract_raw_pointer<value_type>(A.handle());
+    unsigned int const * coords       = detail::extract_raw_pointer<unsigned int>(A.handle2());
+    value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+    value_type inner_prod_ApAp = 0;
+    value_type inner_prod_pAp = 0;
+    value_type inner_prod_Ap_r0star = 0;
+    for (vcl_size_t row = 0; row < A.size1(); ++row)
+    {
+      value_type sum = 0;
+      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+
+      for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
+      {
+        vcl_size_t offset = row + item_id * A.internal_size1();
+        value_type val = elements[offset];
+
+        if (val)
+          sum += (p_buf[coords[offset]] * val);
+      }
+
+      Ap_buf[row] = sum;
+      inner_prod_ApAp += sum * sum;
+      inner_prod_pAp  += val_p_diag * sum;
+      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
+    }
+
+    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
+    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+    if (r0star)
+      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+  }
+
+
+  /** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
+    *
+    * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+    *   Ap = prod(A, p);
+    * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+    */
+  template<typename NumericT, typename IndexT>
+  void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
+                           vector_base<NumericT> const & p,
+                           vector_base<NumericT> & Ap,
+                           NumericT const * r0star,
+                           vector_base<NumericT> & inner_prod_buffer,
+                           vcl_size_t buffer_chunk_size,
+                           vcl_size_t buffer_chunk_offset)
+  {
+    typedef NumericT     value_type;
+
+    value_type       * Ap_buf            = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+    value_type const *  p_buf            = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+    value_type const * elements          = detail::extract_raw_pointer<value_type>(A.handle());
+    IndexT     const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
+    IndexT     const * column_indices    = detail::extract_raw_pointer<IndexT>(A.handle2());
+    IndexT     const * block_start       = detail::extract_raw_pointer<IndexT>(A.handle3());
+    value_type         * data_buffer     = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+    vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
+    std::vector<value_type> result_values(A.rows_per_block());
+
+    value_type inner_prod_ApAp = 0;
+    value_type inner_prod_pAp = 0;
+    value_type inner_prod_Ap_r0star = 0;
+    for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
+    {
+      vcl_size_t current_columns_per_block = columns_per_block[block_idx];
+
+      for (vcl_size_t i=0; i<result_values.size(); ++i)
+        result_values[i] = 0;
+
+      for (IndexT column_entry_index = 0;
+                  column_entry_index < current_columns_per_block;
+                ++column_entry_index)
+      {
+        vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
+        // Note: This for-loop may be unrolled by hand for exploiting vectorization
+        //       Careful benchmarking recommended first, memory channels may be saturated already!
+        for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
+        {
+          value_type val = elements[stride_start + row_in_block];
+
+          result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
+        }
+      }
+
+      vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
+      for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
+      {
+        vcl_size_t row = first_row_in_matrix + row_in_block;
+        if (row < Ap.size())
+        {
+          value_type row_result = result_values[row_in_block];
+
+          Ap_buf[row] = row_result;
+          inner_prod_ApAp += row_result * row_result;
+          inner_prod_pAp  += p_buf[row] * row_result;
+          inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
+        }
+      }
+    }
+
+    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
+    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+    if (r0star)
+      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+  }
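The sliced ELL layout traversed above stores each block of rows_per_block rows column by column: block_start[] gives the offset of a block's first entry and columns_per_block[] its width. The indexing may be easier to follow without the fused reductions; below is a stand-alone y = A*x over the same raw arrays (a sketch with illustrative names, not part of ViennaCL):

#include <cstddef>

// Sketch: sparse matrix-vector product for the sliced ELL layout; zero-padded
// entries are skipped just like in the fused routine above.
template<typename NumericT, typename IndexT>
void sliced_ell_spmv(IndexT const * columns_per_block, IndexT const * column_indices,
                     IndexT const * block_start, NumericT const * elements,
                     std::size_t num_rows, std::size_t rows_per_block,
                     NumericT const * x, NumericT * y)
{
  std::size_t num_blocks = (num_rows + rows_per_block - 1) / rows_per_block;
  for (std::size_t b = 0; b < num_blocks; ++b)
    for (std::size_t r = 0; r < rows_per_block && b * rows_per_block + r < num_rows; ++r)
    {
      NumericT sum = 0;
      for (IndexT c = 0; c < columns_per_block[b]; ++c)
      {
        std::size_t pos = block_start[b] + std::size_t(c) * rows_per_block + r;
        if (elements[pos])
          sum += elements[pos] * x[column_indices[pos]];
      }
      y[b * rows_per_block + r] = sum;
    }
}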
+
+
+  /** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
+    *
+    * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+    *   Ap = prod(A, p);
+    * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+    */
+  template<typename NumericT>
+  void pipelined_prod_impl(hyb_matrix<NumericT> const & A,
+                           vector_base<NumericT> const & p,
+                           vector_base<NumericT> & Ap,
+                           NumericT const * r0star,
+                           vector_base<NumericT> & inner_prod_buffer,
+                           vcl_size_t buffer_chunk_size,
+                           vcl_size_t buffer_chunk_offset)
+  {
+    typedef NumericT     value_type;
+    typedef unsigned int index_type;
+
+    value_type       * Ap_buf            = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+    value_type const *  p_buf            = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+    value_type const * elements          = detail::extract_raw_pointer<value_type>(A.handle());
+    index_type const * coords            = detail::extract_raw_pointer<index_type>(A.handle2());
+    value_type const * csr_elements      = detail::extract_raw_pointer<value_type>(A.handle5());
+    index_type const * csr_row_buffer    = detail::extract_raw_pointer<index_type>(A.handle3());
+    index_type const * csr_col_buffer    = detail::extract_raw_pointer<index_type>(A.handle4());
+    value_type         * data_buffer     = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+    value_type inner_prod_ApAp = 0;
+    value_type inner_prod_pAp = 0;
+    value_type inner_prod_Ap_r0star = 0;
+    for (vcl_size_t row = 0; row < A.size1(); ++row)
+    {
+      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+      value_type sum = 0;
+
+      //
+      // Part 1: Process ELL part
+      //
+      for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
+      {
+        vcl_size_t offset = row + item_id * A.internal_size1();
+        value_type val = elements[offset];
+
+        if (val)
+          sum += p_buf[coords[offset]] * val;
+      }
+
+      //
+      // Part 2: Process CSR part of the HYB format
+      //
+      vcl_size_t col_begin = csr_row_buffer[row];
+      vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+      for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+        sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
+
+      Ap_buf[row] = sum;
+      inner_prod_ApAp += sum * sum;
+      inner_prod_pAp  += val_p_diag * sum;
+      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
+    }
+
+    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
+    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+    if (r0star)
+      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+  }
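The HYB format processed above is an ELL part (regular, zero-padded, stored column-major) plus a CSR part holding the overflow entries of each row. Restated without the fused reductions as a plain y = A*x (a sketch over raw arrays; names are illustrative):

#include <cstddef>

// Sketch: y = A*x for a HYB matrix = ELL part + CSR remainder.
template<typename NumericT>
void hyb_spmv(NumericT const * ell_elements, unsigned int const * ell_coords,
              std::size_t ell_width, std::size_t internal_rows,
              NumericT const * csr_elements, unsigned int const * csr_row_ptr,
              unsigned int const * csr_cols,
              std::size_t num_rows, NumericT const * x, NumericT * y)
{
  for (std::size_t row = 0; row < num_rows; ++row)
  {
    NumericT sum = 0;
    for (std::size_t item = 0; item < ell_width; ++item)          // ELL part
    {
      std::size_t offset = row + item * internal_rows;            // column-major ELL storage
      if (ell_elements[offset])
        sum += ell_elements[offset] * x[ell_coords[offset]];
    }
    for (unsigned int j = csr_row_ptr[row]; j < csr_row_ptr[row + 1]; ++j)   // CSR remainder
      sum += csr_elements[j] * x[csr_cols[j]];
    y[row] = sum;
  }
}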
+
+} // namespace detail
+
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for vectors 'result', 'p', 'r', 'Ap':
+  *   result += alpha * p;
+  *   r      -= alpha * Ap;
+  *   p       = r + beta * p;
+  * and runs the parallel reduction stage for computing inner_prod(r,r)
+  */
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+                                NumericT alpha,
+                                vector_base<NumericT> & p,
+                                vector_base<NumericT> & r,
+                                vector_base<NumericT> const & Ap,
+                                NumericT beta,
+                                vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT       value_type;
+
+  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
+  value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
+  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
+  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
+  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
+  vcl_size_t size  = viennacl::traits::size(result);
+
+  value_type inner_prod_r = 0;
+  for (long i = 0; i < static_cast<long>(size); ++i)
+  {
+    value_type value_p = data_p[static_cast<vcl_size_t>(i)];
+    value_type value_r = data_r[static_cast<vcl_size_t>(i)];
+
+
+    data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
+    value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
+    value_p  = value_r + beta * value_p;
+    inner_prod_r += value_r * value_r;
+
+    data_p[static_cast<vcl_size_t>(i)] = value_p;
+    data_r[static_cast<vcl_size_t>(i)] = value_r;
+  }
+
+  data_buffer[0] = inner_prod_r;
+}
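For orientation, the following sketch shows how a pipelined CG step combines pipelined_cg_prod() and pipelined_cg_vector_update() with the chunked buffer. It is a hypothetical driver, not the ViennaCL implementation (the actual control flow lives in the CG solver, viennacl/linalg/cg.hpp); it assumes the unused entries of each buffer chunk are zero and uses the exact-arithmetic identity <r_new,r_new> = alpha^2 <Ap,Ap> - <r,r> to form beta before the fused update.

#include <cstddef>
#include "viennacl/vector.hpp"
#include "viennacl/linalg/host_based/iterative_operations.hpp"

// Hypothetical sketch of one pipelined CG step on the host backend.
template<typename MatrixT, typename NumericT>
void pipelined_cg_step_sketch(MatrixT const & A,
                              viennacl::vector_base<NumericT> & x,
                              viennacl::vector_base<NumericT> & r,
                              viennacl::vector_base<NumericT> & p,
                              viennacl::vector_base<NumericT> & Ap,
                              viennacl::vector_base<NumericT> & inner_prod_buffer,
                              NumericT & r_dot_r)   // <r,r> carried over from the previous step
{
  std::size_t chunk = inner_prod_buffer.size() / 3;
  NumericT const * buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(inner_prod_buffer);

  // fused Ap = A*p; <Ap,Ap> goes to chunk 1, <p,Ap> to chunk 2:
  viennacl::linalg::host_based::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);

  NumericT ApAp = 0, pAp = 0;
  for (std::size_t i = 0; i < chunk; ++i) { ApAp += buf[chunk + i]; pAp += buf[2 * chunk + i]; }

  NumericT alpha = r_dot_r / pAp;
  NumericT beta  = (alpha * alpha * ApAp - r_dot_r) / r_dot_r;   // recurrence for <r_new,r_new> / <r,r>

  // fused update of x, r and p; the explicitly computed <r_new,r_new> lands in chunk 0:
  viennacl::linalg::host_based::pipelined_cg_vector_update(x, alpha, p, r, Ap, beta, inner_prod_buffer);

  NumericT rr_new = 0;
  for (std::size_t i = 0; i < chunk; ++i) rr_new += buf[i];
  r_dot_r = rr_new;   // use the recomputed value for the convergence check
}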
+
+
+/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT const *    PtrType;
+  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+
+/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT const *    PtrType;
+  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT const *    PtrType;
+  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename NumericT, typename IndexT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT const *    PtrType;
+  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+
+
+/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  typedef NumericT const *    PtrType;
+  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
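Application code normally does not call these overloads directly; they are reached through ViennaCL's generic solver interface. A minimal usage sketch (tolerance and iteration cap are illustrative values):

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cg.hpp"

// Sketch: solve A x = b with conjugate gradients via the generic interface,
// which dispatches to implementations such as the pipelined kernels above.
viennacl::vector<double> solve_with_cg(viennacl::compressed_matrix<double> const & A,
                                       viennacl::vector<double> const & b)
{
  return viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(1e-8, 300));
}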
+
+//////////////////////////
+
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+  *
+  * This routine computes for vectors 's', 'r', 'Ap':
+  *   s = r - alpha * Ap
+  * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
+  * and runs the parallel reduction stage for computing inner_prod(s,s)
+  */
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+                                 vector_base<NumericT> & r,
+                                 vector_base<NumericT> const & Ap,
+                                 vector_base<NumericT> & inner_prod_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  typedef NumericT      value_type;
+
+  value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
+  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
+  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
+  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+  // Note: Due to the special setting in BiCGStab, there is no need to check for sizes and strides
+  vcl_size_t size  = viennacl::traits::size(s);
+
+  // part 1: compute alpha:
+  value_type r_in_r0 = 0;
+  value_type Ap_in_r0 = 0;
+  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
+  {
+     r_in_r0 += data_buffer[i];
+    Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
+  }
+  value_type alpha = r_in_r0 / Ap_in_r0;
+
+  // part 2: s = r - alpha * Ap  and first step in reduction for s:
+  value_type inner_prod_s = 0;
+  for (long i = 0; i < static_cast<long>(size); ++i)
+  {
+    value_type value_s  = data_s[static_cast<vcl_size_t>(i)];
+
+    value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
+    inner_prod_s += value_s * value_s;
+
+    data_s[static_cast<vcl_size_t>(i)] = value_s;
+  }
+
+  data_buffer[buffer_chunk_offset] = inner_prod_s;
+}
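For orientation, the chunk indices read by this routine (only the two used here are listed; they are inferred from the loop above and the 6-chunk layout mentioned in the comment):

// 0-based chunk indices into an inner_prod_buffer of 6 * buffer_chunk_size entries.
enum bicgstab_buffer_chunks
{
  chunk_r_dot_r0star  = 0,  // partial sums of <r, r0*>  (numerator of alpha)
  chunk_Ap_dot_r0star = 3   // partial sums of <Ap, r0*> (denominator of alpha)
};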
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+  *
+  * x_{j+1} = x_j + alpha * p_j + omega * s_j
+  * r_{j+1} = s_j - omega * t_j
+  * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+  * and compute first stage of r_dot_r0 = <r_{j+1}, r_o^*> for use in next iteration
+  */
+ template<typename NumericT>
+ void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+                                       vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+                                       NumericT beta, vector_base<NumericT> const & Ap,
+                                       vector_base<NumericT> const & r0star,
+                                       vector_base<NumericT>       & inner_prod_buffer,
+                                       vcl_size_t buffer_chunk_size)
+ {
+   typedef NumericT    value_type;
+
+   value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
+   value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
+   value_type const * data_s        = detail::extract_raw_pointer<value_type>(s);
+   value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
+   value_type const * data_As       = detail::extract_raw_pointer<value_type>(As);
+   value_type const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
+   value_type const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
+   value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+   vcl_size_t size = viennacl::traits::size(result);
+
+   value_type inner_prod_r_r0star = 0;
+   for (long i = 0; i < static_cast<long>(size); ++i)
+   {
+     vcl_size_t index = static_cast<vcl_size_t>(i);
+     value_type value_result   = data_result[index];
+     value_type value_p        = data_p[index];
+     value_type value_s        = data_s[index];
+     value_type value_residual = data_residual[index];
+     value_type value_As       = data_As[index];
+     value_type value_Ap       = data_Ap[index];
+     value_type value_r0star   = data_r0star[index];
+
+     value_result   += alpha * value_p + omega * value_s;
+     value_residual  = value_s - omega * value_As;
+     value_p         = value_residual + beta * (value_p - omega * value_Ap);
+     inner_prod_r_r0star += value_residual * value_r0star;
+
+     data_result[index]   = value_result;
+     data_residual[index] = value_residual;
+     data_p[index]        = value_p;
+   }
+
+   (void)buffer_chunk_size; // not needed here, just silence compiler warning (unused variable)
+   data_buffer[0] = inner_prod_r_r0star;
+ }
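The scalars alpha, omega and beta passed into this routine follow the standard BiCGStab recurrences; they are restated below purely for orientation. This is not code from ViennaCL, and the actual driver assembles these values from the chunked reduction buffers.

// Standard BiCGStab scalar updates, given the required inner products.
template<typename T>
void bicgstab_scalars(T r_dot_r0star,      // <r_j, r0*>
                      T Ap_dot_r0star,     // <A p_j, r0*>
                      T As_dot_s,          // <A s_j, s_j>
                      T As_dot_As,         // <A s_j, A s_j>
                      T r_new_dot_r0star,  // <r_{j+1}, r0*>
                      T & alpha, T & omega, T & beta)
{
  alpha = r_dot_r0star / Ap_dot_r0star;
  omega = As_dot_s / As_dot_As;
  beta  = (r_new_dot_r0star / r_dot_r0star) * (alpha / omega);
}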
+
+ /** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm.
+   *
+   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+   *   Ap = prod(A, p);
+   * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+   */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+                              vector_base<NumericT> const & p,
+                              vector_base<NumericT> & Ap,
+                              vector_base<NumericT> const & r0star,
+                              vector_base<NumericT> & inner_prod_buffer,
+                              vcl_size_t buffer_chunk_size,
+                              vcl_size_t buffer_chunk_offset)
+ {
+   NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
+
+   viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm.
+   *
+   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+   *   Ap = prod(A, p);
+   * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+   */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+                              vector_base<NumericT> const & p,
+                              vector_base<NumericT> & Ap,
+                              vector_base<NumericT> const & r0star,
+                              vector_base<NumericT> & inner_prod_buffer,
+                              vcl_size_t buffer_chunk_size,
+                              vcl_size_t buffer_chunk_offset)
+ {
+   NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
+
+   viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm.
+   *
+   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+   *   Ap = prod(A, p);
+   * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+   */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+                              vector_base<NumericT> const & p,
+                              vector_base<NumericT> & Ap,
+                              vector_base<NumericT> const & r0star,
+                              vector_base<NumericT> & inner_prod_buffer,
+                              vcl_size_t buffer_chunk_size,
+                              vcl_size_t buffer_chunk_offset)
+ {
+   NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
+
+   viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm.
+   *
+   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+   *   Ap = prod(A, p);
+   * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+   */
+ template<typename NumericT, typename IndexT>
+ void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
+                              vector_base<NumericT> const & p,
+                              vector_base<NumericT> & Ap,
+                              vector_base<NumericT> const & r0star,
+                              vector_base<NumericT> & inner_prod_buffer,
+                              vcl_size_t buffer_chunk_size,
+                              vcl_size_t buffer_chunk_offset)
+ {
+   NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
+
+   viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm.
+   *
+   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0':
+   *   Ap = prod(A, p);
+   * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), inner_prod(Ap, r0)
+   */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+                              vector_base<NumericT> const & p,
+                              vector_base<NumericT> & Ap,
+                              vector_base<NumericT> const & r0star,
+                              vector_base<NumericT> & inner_prod_buffer,
+                              vcl_size_t buffer_chunk_size,
+                              vcl_size_t buffer_chunk_offset)
+ {
+   NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
+
+   viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+
+/////////////////////////////////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes for vectors 'r', 'v_k':
+ *   Second reduction step for ||v_k||
+ *   v_k /= ||v_k||
+ *   First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+                                 vector_base<T> const & residual,
+                                 vector_base<T> & R_buffer,
+                                 vcl_size_t offset_in_R,
+                                 vector_base<T> const & inner_prod_buffer,
+                                 vector_base<T> & r_dot_vk_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  typedef T        value_type;
+
+  value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
+  value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
+  value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
+  value_type const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+  value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
+
+  // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
+  vcl_size_t size     = viennacl::traits::size(v_k);
+  vcl_size_t vk_start = viennacl::traits::start(v_k);
+
+  // part 1: compute alpha:
+  value_type norm_vk = 0;
+  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
+   norm_vk += data_buffer[i + buffer_chunk_size];
+  norm_vk = std::sqrt(norm_vk);
+  data_R[offset_in_R] = norm_vk;
+
+  // Compute <r, v_k> after normalization of v_k:
+  value_type inner_prod_r_dot_vk = 0;
+  for (long i = 0; i < static_cast<long>(size); ++i)
+  {
+    value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
+
+    inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;
+
+    data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
+  }
+
+  data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
+}
+
+
+
+/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1
+ *
+ *  All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+                                        vcl_size_t v_k_size,
+                                        vcl_size_t v_k_internal_size,
+                                        vcl_size_t k,
+                                        vector_base<T> & vi_in_vk_buffer,
+                                        vcl_size_t buffer_chunk_size)
+{
+  typedef T        value_type;
+
+  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
+  value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
+
+  // reset buffer:
+  for (vcl_size_t j = 0; j < k; ++j)
+    data_inner_prod[j*buffer_chunk_size] = value_type(0);
+
+  // compute inner products:
+  for (vcl_size_t i = 0; i < v_k_size; ++i)
+  {
+    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
+
+    for (vcl_size_t j = 0; j < k; ++j)
+      data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
+  }
+}
+
+
+/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||
+ *
+ *  All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+                                        vcl_size_t v_k_size,
+                                        vcl_size_t v_k_internal_size,
+                                        vcl_size_t k,
+                                        vector_base<T> const & vi_in_vk_buffer,
+                                        vector_base<T> & R_buffer,
+                                        vcl_size_t krylov_dim,
+                                        vector_base<T> & inner_prod_buffer,
+                                        vcl_size_t buffer_chunk_size)
+{
+  typedef T        value_type;
+
+  value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
+
+  std::vector<T> values_vi_in_vk(k);
+
+  // Step 1: Finish reduction of <v_i, v_k> to obtain scalars:
+  for (std::size_t i=0; i<k; ++i)
+    for (vcl_size_t j=0; j<buffer_chunk_size; ++j)
+      values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];
+
+
+  // Step 2: Compute v_k -= <v_i, v_k> v_i and reduction on ||v_k||:
+  value_type norm_vk = 0;
+  for (vcl_size_t i = 0; i < v_k_size; ++i)
+  {
+    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
+
+    for (vcl_size_t j = 0; j < k; ++j)
+      value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
+
+    norm_vk += value_vk * value_vk;
+    data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
+  }
+
+  // Step 3: Write values to R_buffer:
+  for (std::size_t i=0; i<k; ++i)
+    R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];
+
+  inner_prod_buffer[buffer_chunk_size] = norm_vk;
+}
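Taken together, stage 1 and stage 2 perform one classical Gram-Schmidt orthogonalization of v_k against v_0..v_{k-1}; the split only exists so that the inner products can be reduced in two stages. An unsplit reference version on plain arrays (a readability sketch, not ViennaCL code):

#include <cstddef>

// Classical Gram-Schmidt step: R(j,k) = <v_j, v_k>, then v_k -= R(j,k) v_j.
// 'ld' is the padded column stride (v_k_internal_size in the routines above).
template<typename T>
void classical_gram_schmidt_step(T * basis, std::size_t n, std::size_t ld,
                                 std::size_t k, T * R_column)
{
  for (std::size_t j = 0; j < k; ++j)
  {
    T r = 0;
    for (std::size_t i = 0; i < n; ++i)
      r += basis[i + j * ld] * basis[i + k * ld];
    R_column[j] = r;
  }
  for (std::size_t i = 0; i < n; ++i)
    for (std::size_t j = 0; j < k; ++j)
      basis[i + k * ld] -= R_column[j] * basis[i + j * ld];
}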
+
+/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+                                  vector_base<T> const & residual,
+                                  vector_base<T> const & krylov_basis,
+                                  vcl_size_t v_k_size,
+                                  vcl_size_t v_k_internal_size,
+                                  vector_base<T> const & coefficients,
+                                  vcl_size_t k)
+{
+  typedef T        value_type;
+
+  value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
+  value_type const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
+  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
+  value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
+
+  for (vcl_size_t i = 0; i < v_k_size; ++i)
+  {
+    value_type value_result = data_result[i];
+
+    value_result += data_coefficients[0] * data_residual[i];
+    for (vcl_size_t j = 1; j<k; ++j)
+      value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
+
+    data_result[i] = value_result;
+  }
+
+}
+
+// Reuse implementation from CG:
+template <typename MatrixType, typename T>
+void pipelined_gmres_prod(MatrixType const & A,
+                      vector_base<T> const & p,
+                      vector_base<T> & Ap,
+                      vector_base<T> & inner_prod_buffer)
+{
+  pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+}
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


[26/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
new file mode 100644
index 0000000..93b0cba
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
@@ -0,0 +1,263 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/common.hpp
+    @brief Common routines used within ILU-type preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/misc_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+
+//
+// Level Scheduling Setup for ILU:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                                 viennacl::vector<NumericT> const & diagonal_LU,
+                                 std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                                 std::list<viennacl::backend::mem_handle> & row_buffers,
+                                 std::list<viennacl::backend::mem_handle> & col_buffers,
+                                 std::list<viennacl::backend::mem_handle> & element_buffers,
+                                 std::list<vcl_size_t> & row_elimination_num_list,
+                                 bool setup_U)
+{
+  NumericT     const * diagonal_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_LU.handle());
+  NumericT     const * elements     = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(LU.handle());
+  unsigned int const * row_buffer   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+  unsigned int const * col_buffer   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+
+  //
+  // Step 1: Determine row elimination order for each row and build up meta information about the number of entries taking part in each elimination step:
+  //
+  std::vector<vcl_size_t> row_elimination(LU.size1());
+  std::map<vcl_size_t, std::map<vcl_size_t, vcl_size_t> > row_entries_per_elimination_step;
+
+  vcl_size_t max_elimination_runs = 0;
+  for (vcl_size_t row2 = 0; row2 < LU.size1(); ++row2)
+  {
+    vcl_size_t row = setup_U ? (LU.size1() - row2) - 1 : row2;
+
+    vcl_size_t row_begin = row_buffer[row];
+    vcl_size_t row_end   = row_buffer[row+1];
+    vcl_size_t elimination_index = 0;  //Note: first run corresponds to elimination_index = 1 (otherwise, type issues with int <-> unsigned int would arise)
+    for (vcl_size_t i = row_begin; i < row_end; ++i)
+    {
+      unsigned int col = col_buffer[i];
+      if ( (!setup_U && col < row) || (setup_U && col > row) )
+      {
+        elimination_index = std::max<vcl_size_t>(elimination_index, row_elimination[col]);
+        row_entries_per_elimination_step[row_elimination[col]][row] += 1;
+      }
+    }
+    row_elimination[row] = elimination_index + 1;
+    max_elimination_runs = std::max<vcl_size_t>(max_elimination_runs, elimination_index + 1);
+  }
+
+  //std::cout << "Number of elimination runs: " << max_elimination_runs << std::endl;
+
+  //
+  // Step 2: Build row-major elimination matrix for each elimination step
+  //
+
+  //std::cout << "Elimination order: " << std::endl;
+  //for (vcl_size_t i=0; i<row_elimination.size(); ++i)
+  //  std::cout << row_elimination[i] << ", ";
+  //std::cout << std::endl;
+
+  //vcl_size_t summed_rows = 0;
+  for (vcl_size_t elimination_run = 1; elimination_run <= max_elimination_runs; ++elimination_run)
+  {
+    std::map<vcl_size_t, vcl_size_t> const & current_elimination_info = row_entries_per_elimination_step[elimination_run];
+
+    // count cols and entries handled in this elimination step
+    vcl_size_t num_tainted_cols = current_elimination_info.size();
+    vcl_size_t num_entries = 0;
+
+    for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                          it != current_elimination_info.end();
+                                                        ++it)
+      num_entries += it->second;
+
+    //std::cout << "num_entries: " << num_entries << std::endl;
+    //std::cout << "num_tainted_cols: " << num_tainted_cols << std::endl;
+
+    if (num_tainted_cols > 0)
+    {
+      row_index_arrays.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(row_index_arrays.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_row_index_array(row_index_arrays.back(), num_tainted_cols);
+
+      row_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(row_buffers.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_row_buffer(row_buffers.back(), num_tainted_cols + 1);
+
+      col_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(col_buffers.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_col_buffer(col_buffers.back(), num_entries);
+
+      element_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<NumericT>(element_buffers.back(), viennacl::traits::context(LU));
+      std::vector<NumericT> elim_elements_buffer(num_entries);
+
+      row_elimination_num_list.push_back(num_tainted_cols);
+
+      vcl_size_t k=0;
+      vcl_size_t nnz_index = 0;
+      elim_row_buffer.set(0, 0);
+
+      for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                              it != current_elimination_info.end();
+                                                            ++it)
+      {
+        //vcl_size_t col = setup_U ? (elimination_matrix.size() - it->first) - 1 : col2;
+        vcl_size_t row = it->first;
+        elim_row_index_array.set(k, row);
+
+        vcl_size_t row_begin = row_buffer[row];
+        vcl_size_t row_end   = row_buffer[row+1];
+        for (vcl_size_t i = row_begin; i < row_end; ++i)
+        {
+          unsigned int col = col_buffer[i];
+          if ( (!setup_U && col < row) || (setup_U && col > row) ) //entry of L/U
+          {
+            if (row_elimination[col] == elimination_run) // this entry is substituted in this run
+            {
+              elim_col_buffer.set(nnz_index, col);
+              elim_elements_buffer[nnz_index] = setup_U ? elements[i] / diagonal_buf[it->first] : elements[i];
+              ++nnz_index;
+            }
+          }
+        }
+
+        elim_row_buffer.set(++k, nnz_index);
+      }
+
+      //
+      // Wrap in memory_handles:
+      //
+      viennacl::backend::memory_create(row_index_arrays.back(), elim_row_index_array.raw_size(),                viennacl::traits::context(row_index_arrays.back()), elim_row_index_array.get());
+      viennacl::backend::memory_create(row_buffers.back(),      elim_row_buffer.raw_size(),                     viennacl::traits::context(row_buffers.back()),      elim_row_buffer.get());
+      viennacl::backend::memory_create(col_buffers.back(),      elim_col_buffer.raw_size(),                     viennacl::traits::context(col_buffers.back()),      elim_col_buffer.get());
+      viennacl::backend::memory_create(element_buffers.back(),  sizeof(NumericT) * elim_elements_buffer.size(), viennacl::traits::context(element_buffers.back()),  &(elim_elements_buffer[0]));
+    }
+
+    // Print some info:
+    //std::cout << "Eliminated columns in run " << elimination_run << ": " << num_tainted_cols << " (tainted columns: " << num_tainted_cols << ")" << std::endl;
+    //summed_rows += eliminated_rows_in_run;
+    //if (eliminated_rows_in_run == 0)
+    //  break;
+  }
+  //std::cout << "Eliminated rows: " << summed_rows << " out of " << row_elimination.size() << std::endl;
+}
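The heart of Step 1 above is the elimination level assigned to each row: a row can only be substituted once every row it depends on (strictly lower entries for L, strictly upper for U) has been substituted, and rows sharing a level are mutually independent. For the lower-triangular case this boils down to the following (a condensed sketch over plain CSR arrays, not ViennaCL code):

#include <vector>
#include <algorithm>
#include <cstddef>

// Level of row i = 1 + max(level of all columns j < i with L(i,j) != 0).
// Rows sharing a level can be processed in parallel.
std::vector<std::size_t> elimination_levels_L(unsigned int const * row_ptr,
                                              unsigned int const * cols,
                                              std::size_t num_rows)
{
  std::vector<std::size_t> level(num_rows, 0);
  for (std::size_t row = 0; row < num_rows; ++row)
  {
    std::size_t lvl = 0;
    for (unsigned int j = row_ptr[row]; j < row_ptr[row + 1]; ++j)
      if (cols[j] < row)
        lvl = std::max(lvl, level[cols[j]]);
    level[row] = lvl + 1;
  }
  return level;
}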
+
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_L(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                              viennacl::vector<NumericT> const & diagonal_LU,
+                              std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                              std::list<viennacl::backend::mem_handle> & row_buffers,
+                              std::list<viennacl::backend::mem_handle> & col_buffers,
+                              std::list<viennacl::backend::mem_handle> & element_buffers,
+                              std::list<vcl_size_t> & row_elimination_num_list)
+{
+  level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false);
+}
+
+
+//
+// Multifrontal setup of U:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_U(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                              viennacl::vector<NumericT> const & diagonal_LU,
+                              std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                              std::list<viennacl::backend::mem_handle> & row_buffers,
+                              std::list<viennacl::backend::mem_handle> & col_buffers,
+                              std::list<viennacl::backend::mem_handle> & element_buffers,
+                              std::list<vcl_size_t> & row_elimination_num_list)
+{
+  level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true);
+}
+
+
+//
+// Multifrontal substitution (both L and U). Will partly be moved to single_threaded/opencl/cuda implementations
+//
+template<typename NumericT>
+void level_scheduling_substitute(viennacl::vector<NumericT> & vec,
+                                 std::list<viennacl::backend::mem_handle> const & row_index_arrays,
+                                 std::list<viennacl::backend::mem_handle> const & row_buffers,
+                                 std::list<viennacl::backend::mem_handle> const & col_buffers,
+                                 std::list<viennacl::backend::mem_handle> const & element_buffers,
+                                 std::list<vcl_size_t> const & row_elimination_num_list)
+{
+  typedef typename std::list< viennacl::backend::mem_handle >::const_iterator  ListIterator;
+  ListIterator row_index_array_it = row_index_arrays.begin();
+  ListIterator row_buffers_it = row_buffers.begin();
+  ListIterator col_buffers_it = col_buffers.begin();
+  ListIterator element_buffers_it = element_buffers.begin();
+  typename std::list< vcl_size_t>::const_iterator row_elimination_num_it = row_elimination_num_list.begin();
+  for (vcl_size_t i=0; i<row_index_arrays.size(); ++i)
+  {
+    viennacl::linalg::detail::level_scheduling_substitute(vec, *row_index_array_it, *row_buffers_it, *col_buffers_it, *element_buffers_it, *row_elimination_num_it);
+
+    ++row_index_array_it;
+    ++row_buffers_it;
+    ++col_buffers_it;
+    ++element_buffers_it;
+    ++row_elimination_num_it;
+  }
+}
+
+
+
+
+
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+
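For readers skimming the diff: the level-scheduling setup routines above group the rows of a triangular factor into levels such that every row depends only on rows from earlier levels, so all rows within one level can be substituted in parallel. A minimal host-side sketch of that level computation for a (unit) lower-triangular CSR matrix, purely illustrative and not part of ViennaCL:

  #include <vector>
  #include <algorithm>
  #include <cstddef>

  // Returns the elimination level of each row of a unit lower-triangular CSR matrix.
  // Rows sharing a level have no mutual dependencies and can be processed concurrently.
  std::vector<std::size_t> compute_levels(std::vector<unsigned int> const & row_buffer,
                                          std::vector<unsigned int> const & col_buffer,
                                          std::size_t num_rows)
  {
    std::vector<std::size_t> level(num_rows, 0);
    for (std::size_t i = 0; i < num_rows; ++i)
      for (unsigned int j = row_buffer[i]; j < row_buffer[i+1]; ++j)
        if (col_buffer[j] < i)   // strictly lower-triangular dependency
          level[i] = std::max(level[i], level[col_buffer[j]] + std::size_t(1));
    return level;
  }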

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
new file mode 100644
index 0000000..1c3191a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
@@ -0,0 +1,379 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU0_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU0_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilu0.hpp
+  @brief Implementations of incomplete factorization preconditioners with static nonzero pattern.
+
+  Contributed by Evan Bollig.
+
+  ILU0 (Incomplete LU with zero fill-in)
+  - All preconditioner nonzeros exist at locations that were nonzero in the input matrix.
+  - The number of nonzeros in the output preconditioner is exactly the same as in the input matrix.
+
+ Evan Bollig 3/30/12
+
+ Adapted from viennacl/linalg/detail/ilut.hpp
+
+ Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
+
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with static pattern (ILU0)
+*/
+class ilu0_tag
+{
+public:
+  ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
+
+  bool use_level_scheduling() const { return use_level_scheduling_; }
+  void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+private:
+  bool use_level_scheduling_;
+};
+
+
+/** @brief Implementation of an ILU preconditioner with static pattern (ILU0). Optimized version for CSR matrices.
+  *
+  * Refer to the algorithm in Saad's book (1996 edition).
+  *
+  *  @param A       The sparse system matrix in CSR format. The result is written directly to A.
+  */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ilu0_tag const & /* tag */)
+{
+  assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+  assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+  assert( (A.handle().get_active_handle_id()  == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+  NumericT           * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  // Note: Line numbers in the following refer to the algorithm in Saad's book
+
+  for (vcl_size_t i=1; i<A.size1(); ++i)  // Line 1
+  {
+    unsigned int row_i_begin = row_buffer[i];
+    unsigned int row_i_end   = row_buffer[i+1];
+    for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) //Note: We do not assume that the column indices within a row are sorted
+    {
+      unsigned int k = col_buffer[buf_index_k];
+      if (k >= i)
+        continue; //Note: We do not assume that the column indices within a row are sorted
+
+      unsigned int row_k_begin = row_buffer[k];
+      unsigned int row_k_end   = row_buffer[k+1];
+
+      // get a_kk:
+      NumericT a_kk = 0;
+      for (unsigned int buf_index_akk = row_k_begin; buf_index_akk < row_k_end; ++buf_index_akk)
+      {
+        if (col_buffer[buf_index_akk] == k)
+        {
+          a_kk = elements[buf_index_akk];
+          break;
+        }
+      }
+
+      NumericT & a_ik = elements[buf_index_k];
+      a_ik /= a_kk;                                 //Line 3
+
+      for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) //Note: We do not assume that the column indices within a row are sorted
+      {
+        unsigned int j = col_buffer[buf_index_j];
+        if (j <= k)
+          continue;
+
+        // determine a_kj:
+        NumericT a_kj = 0;
+        for (unsigned int buf_index_akj = row_k_begin; buf_index_akj < row_k_end; ++buf_index_akj)
+        {
+          if (col_buffer[buf_index_akj] == j)
+          {
+            a_kj = elements[buf_index_akj];
+            break;
+          }
+        }
+
+        //a_ij -= a_ik * a_kj
+        elements[buf_index_j] -= a_ik * a_kj;  //Line 5
+      }
+    }
+  }
+
+}
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilu0_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ilu0_precond(MatrixT const & mat, ilu0_tag const & tag) : tag_(tag), LU_()
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle2());
+    NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LU_.handle());
+
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), unit_lower_tag());
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), upper_tag());
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LU_, host_context);
+
+    viennacl::copy(mat, LU_);
+    viennacl::linalg::precondition(LU_, tag_);
+  }
+
+  ilu0_tag                                   tag_;
+  viennacl::compressed_matrix<NumericType>   LU_;
+};
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilu0_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef viennacl::compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ilu0_precond(MatrixType const & mat, ilu0_tag const & tag)
+    : tag_(tag),
+      LU_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(viennacl::vector<NumericT> & vec) const
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal on GPU..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+      }
+      else
+      {
+        viennacl::context old_context = viennacl::traits::context(vec);
+        viennacl::switch_memory_context(vec, host_context);
+        viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+        viennacl::switch_memory_context(vec, old_context);
+      }
+    }
+    else //apply ILU0 directly on CPU
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+      }
+      else
+      {
+        viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+      }
+    }
+  }
+
+  vcl_size_t levels() const { return multifrontal_L_row_index_arrays_.size(); }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LU_, host_context);
+    LU_ = mat;
+    viennacl::linalg::precondition(LU_, tag_);
+
+    if (!tag_.use_level_scheduling())
+      return;
+
+    // multifrontal part:
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+    multifrontal_U_diagonal_.resize(LU_.size1(), false);
+    host_based::detail::row_info(LU_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+    detail::level_scheduling_setup_L(LU_,
+                                     multifrontal_U_diagonal_, //dummy
+                                     multifrontal_L_row_index_arrays_,
+                                     multifrontal_L_row_buffers_,
+                                     multifrontal_L_col_buffers_,
+                                     multifrontal_L_element_buffers_,
+                                     multifrontal_L_row_elimination_num_list_);
+
+
+    detail::level_scheduling_setup_U(LU_,
+                                     multifrontal_U_diagonal_,
+                                     multifrontal_U_row_index_arrays_,
+                                     multifrontal_U_row_buffers_,
+                                     multifrontal_U_col_buffers_,
+                                     multifrontal_U_element_buffers_,
+                                     multifrontal_U_row_elimination_num_list_);
+
+    //
+    // Bring to device if necessary:
+    //
+
+    // L:
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                       it != multifrontal_L_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                       it != multifrontal_L_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                       it != multifrontal_L_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                       it != multifrontal_L_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+    // U:
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                       it != multifrontal_U_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                       it != multifrontal_U_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                       it != multifrontal_U_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                       it != multifrontal_U_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+  }
+
+  ilu0_tag tag_;
+  viennacl::compressed_matrix<NumericT> LU_;
+
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+  std::list<vcl_size_t>                    multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+  std::list<vcl_size_t>                    multifrontal_U_row_elimination_num_list_;
+
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+
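A minimal usage sketch for the ILU0 preconditioner added in this file, assuming the umbrella header viennacl/linalg/ilu.hpp forwards to it as in upstream ViennaCL; the test matrix (an SPD tridiagonal stencil) and the solver tolerances are placeholders:

  #include <map>
  #include <vector>
  #include "viennacl/compressed_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/cg.hpp"
  #include "viennacl/linalg/ilu.hpp"

  int main()
  {
    std::size_t N = 100;

    // assemble a simple SPD tridiagonal matrix on the host:
    std::vector< std::map<unsigned int, double> > cpu_A(N);
    for (unsigned int i = 0; i < N; ++i)
    {
      cpu_A[i][i] = 2.0;
      if (i > 0)     cpu_A[i][i-1] = -1.0;
      if (i + 1 < N) cpu_A[i][i+1] = -1.0;
    }

    viennacl::compressed_matrix<double> A(N, N);
    viennacl::copy(cpu_A, A);

    viennacl::vector<double> rhs = viennacl::scalar_vector<double>(N, 1.0);

    // ILU0 with level scheduling enabled for the triangular solves:
    viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> >
        ilu0(A, viennacl::linalg::ilu0_tag(true));

    // preconditioned conjugate gradient solve:
    viennacl::vector<double> x = viennacl::linalg::solve(A, rhs,
                                                         viennacl::linalg::cg_tag(1e-8, 200),
                                                         ilu0);
    (void)x;
    return 0;
  }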

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
new file mode 100644
index 0000000..11ab842
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
@@ -0,0 +1,597 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILUT_HPP_
+#define VIENNACL_LINALG_DETAIL_ILUT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilut.hpp
+    @brief Implementations of an incomplete factorization preconditioner with threshold (ILUT)
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with threshold (ILUT)
+*/
+class ilut_tag
+{
+  public:
+    /** @brief The constructor.
+    *
+    * @param entries_per_row        Number of nonzero entries to keep per row in each of L and U (i.e. roughly 2*entries_per_row per row of the factorization in total).
+    * @param drop_tolerance         The drop tolerance for ILUT
+    * @param with_level_scheduling  Flag for enabling level scheduling on GPUs.
+    */
+    ilut_tag(unsigned int entries_per_row = 20,
+             double       drop_tolerance = 1e-4,
+             bool         with_level_scheduling = false)
+      : entries_per_row_(entries_per_row),
+        drop_tolerance_(drop_tolerance),
+        use_level_scheduling_(with_level_scheduling) {}
+
+    void set_drop_tolerance(double tol)
+    {
+      if (tol > 0)
+        drop_tolerance_ = tol;
+    }
+    double get_drop_tolerance() const { return drop_tolerance_; }
+
+    void set_entries_per_row(unsigned int e)
+    {
+      if (e > 0)
+        entries_per_row_ = e;
+    }
+
+    unsigned int get_entries_per_row() const { return entries_per_row_; }
+
+    bool use_level_scheduling() const { return use_level_scheduling_; }
+    void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+  private:
+    unsigned int entries_per_row_;
+    double       drop_tolerance_;
+    bool         use_level_scheduling_;
+};
+
+
+namespace detail
+{
+  /** @brief Helper struct for holding a sparse vector in linear memory. For internal use only.
+    *
+    * Unfortunately, the 'naive' implementation using a std::map<> is almost always too slow.
+    *
+    */
+  template<typename NumericT>
+  struct ilut_sparse_vector
+  {
+    ilut_sparse_vector(vcl_size_t alloc_size = 0) : size_(0), col_indices_(alloc_size), elements_(alloc_size) {}
+
+    void resize_if_bigger(vcl_size_t s)
+    {
+      if (s > elements_.size())
+      {
+        col_indices_.resize(s);
+        elements_.resize(s);
+      }
+      size_ = s;
+    }
+
+    vcl_size_t size_;
+    std::vector<unsigned int> col_indices_;
+    std::vector<NumericT>     elements_;
+  };
+
+  /** @brief Subtracts a scaled sparse vector u from a sparse vector w and writes the output to z: z = w - alpha * u
+    *
+    * The sparsity patterns of u and w usually differ.
+    *
+    * @return Length of the resulting vector z
+    */
+  template<typename IndexT, typename NumericT>
+  IndexT merge_subtract_sparse_rows(IndexT const * w_coords, NumericT const * w_elements, IndexT w_size,
+                                    IndexT const * u_coords, NumericT const * u_elements, IndexT u_size, NumericT alpha,
+                                    IndexT       * z_coords, NumericT       * z_elements)
+  {
+    IndexT index_w = 0;
+    IndexT index_u = 0;
+    IndexT index_z = 0;
+
+    while (1)
+    {
+      if (index_w < w_size && index_u < u_size)
+      {
+        if (w_coords[index_w] < u_coords[index_u])
+        {
+          z_coords[index_z]     = w_coords[index_w];
+          z_elements[index_z++] = w_elements[index_w++];
+        }
+        else if (w_coords[index_w] == u_coords[index_u])
+        {
+          z_coords[index_z]     = w_coords[index_w];
+          z_elements[index_z++] = w_elements[index_w++] - alpha * u_elements[index_u++];
+        }
+        else
+        {
+          z_coords[index_z]     = u_coords[index_u];
+          z_elements[index_z++] = - alpha * u_elements[index_u++];
+        }
+      }
+      else if (index_w == w_size && index_u < u_size)
+      {
+        z_coords[index_z]     = u_coords[index_u];
+        z_elements[index_z++] = - alpha * u_elements[index_u++];
+      }
+      else if (index_w < w_size && index_u == u_size)
+      {
+        z_coords[index_z]     = w_coords[index_w];
+        z_elements[index_z++] = w_elements[index_w++];
+      }
+      else
+        return index_z;
+    }
+  }
+
+  template<typename SizeT, typename NumericT>
+  void insert_with_value_sort(std::vector<std::pair<SizeT, NumericT> > & map,
+                              SizeT index, NumericT value)
+  {
+    NumericT abs_value = std::fabs(value);
+    if (abs_value > 0)
+    {
+      // find first element with smaller absolute value:
+      std::size_t first_smaller_index = 0;
+      while (first_smaller_index < map.size() && std::fabs(map[first_smaller_index].second) > abs_value)
+        ++first_smaller_index;
+
+      std::pair<SizeT, NumericT> tmp(index, value);
+      for (std::size_t j=first_smaller_index; j<map.size(); ++j)
+        std::swap(map[j], tmp);
+    }
+  }
+
+}
+
+/** @brief Implementation of an ILU preconditioner with threshold (ILUT). Optimized implementation for compressed_matrix.
+*
+* Refer to Algorithm 10.6 in Saad's book (1996 edition).
+*
+*  @param A       The input matrix (compressed_matrix). Remains unchanged; the factors are written to L and U.
+*  @param L       The output matrix for L.
+*  @param U       The output matrix for U.
+*  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
+*/
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                  viennacl::compressed_matrix<NumericT>       & L,
+                  viennacl::compressed_matrix<NumericT>       & U,
+                  ilut_tag const & tag)
+{
+  assert(A.size1() == L.size1() && bool("Output matrix size mismatch") );
+  assert(A.size1() == U.size1() && bool("Output matrix size mismatch") );
+
+  L.reserve( tag.get_entries_per_row()      * A.size1());
+  U.reserve((tag.get_entries_per_row() + 1) * A.size1());
+
+  vcl_size_t avg_nnz_per_row = static_cast<vcl_size_t>(A.nnz() / A.size1());
+  detail::ilut_sparse_vector<NumericT> w1(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+  detail::ilut_sparse_vector<NumericT> w2(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+  detail::ilut_sparse_vector<NumericT> * w_in  = &w1;
+  detail::ilut_sparse_vector<NumericT> * w_out = &w2;
+  std::vector<NumericT> diagonal_U(A.size1());
+
+  NumericT     const * elements_A   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  NumericT           * elements_L   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.handle());
+  unsigned int       * row_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle1()); row_buffer_L[0] = 0;
+  unsigned int       * col_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+  NumericT           * elements_U   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.handle());
+  unsigned int       * row_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle1()); row_buffer_U[0] = 0;
+  unsigned int       * col_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+  std::vector<std::pair<unsigned int, NumericT> > sorted_entries_L(tag.get_entries_per_row());
+  std::vector<std::pair<unsigned int, NumericT> > sorted_entries_U(tag.get_entries_per_row());
+
+  for (vcl_size_t i=0; i<viennacl::traits::size1(A); ++i)  // Line 1
+  {
+    std::fill(sorted_entries_L.begin(), sorted_entries_L.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+    std::fill(sorted_entries_U.begin(), sorted_entries_U.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+
+    //line 2: set up w
+    w_in->resize_if_bigger(row_buffer_A[i+1] - row_buffer_A[i]);
+    NumericT row_norm = 0;
+    unsigned int k = 0;
+    for (unsigned int j = row_buffer_A[i]; j < row_buffer_A[i+1]; ++j, ++k)
+    {
+      w_in->col_indices_[k] = col_buffer_A[j];
+      NumericT entry = elements_A[j];
+      w_in->elements_[k] = entry;
+      row_norm += entry * entry;
+    }
+    row_norm = std::sqrt(row_norm);
+    NumericT tau_i = static_cast<NumericT>(tag.get_drop_tolerance()) * row_norm;
+
+    //line 3: Iterate over lower diagonal parts of A:
+    k = 0;
+    unsigned int current_col = (row_buffer_A[i+1] > row_buffer_A[i]) ? w_in->col_indices_[k] : static_cast<unsigned int>(i); // mind empty rows here!
+    while (current_col < i)
+    {
+      //line 4:
+      NumericT a_kk = diagonal_U[current_col];
+
+      NumericT w_k_entry = w_in->elements_[k] / a_kk;
+      w_in->elements_[k] = w_k_entry;
+
+      //lines 5,6: (dropping rule to w_k)
+      if ( std::fabs(w_k_entry) > tau_i)
+      {
+        //line 7:
+        unsigned int row_U_begin = row_buffer_U[current_col];
+        unsigned int row_U_end   = row_buffer_U[current_col + 1];
+
+        if (row_U_end > row_U_begin)
+        {
+          w_out->resize_if_bigger(w_in->size_ + (row_U_end - row_U_begin) - 1);
+          w_out->size_ = detail::merge_subtract_sparse_rows(&(w_in->col_indices_[0]), &(w_in->elements_[0]), static_cast<unsigned int>(w_in->size_),
+                                                            col_buffer_U + row_U_begin + 1, elements_U + row_U_begin + 1, (row_U_end - row_U_begin) - 1, w_k_entry,
+                                                            &(w_out->col_indices_[0]), &(w_out->elements_[0])
+                                                           );
+          ++k;
+        }
+      }
+      else // drop element
+      {
+        w_out->resize_if_bigger(w_in->size_ - 1);
+        for (unsigned int r = 0; r < k; ++r)
+        {
+          w_out->col_indices_[r] = w_in->col_indices_[r];
+          w_out->elements_[r]    = w_in->elements_[r];
+        }
+        for (unsigned int r = k+1; r < w_in->size_; ++r)
+        {
+          w_out->col_indices_[r-1] = w_in->col_indices_[r];
+          w_out->elements_[r-1]    = w_in->elements_[r];
+        }
+
+        // Note: No increment to k here, element was dropped!
+      }
+
+      // swap pointers to w1 and w2
+      std::swap(w_in, w_out);
+
+      // process next entry:
+      current_col = (k < w_in->size_) ? w_in->col_indices_[k] : static_cast<unsigned int>(i);
+    } // while()
+
+    // Line 10: Apply a dropping rule to w
+    // To do so, we write values to a temporary array
+    for (unsigned int r = 0; r < w_in->size_; ++r)
+    {
+      unsigned int col   = w_in->col_indices_[r];
+      NumericT     value = w_in->elements_[r];
+
+      if (col < i) // entry for L:
+        detail::insert_with_value_sort(sorted_entries_L, col, value);
+      else if (col == i) // do not drop diagonal element
+      {
+        diagonal_U[i] = value;
+        if (value <= 0 && value >= 0) // exact zero check, written without == to avoid floating-point equality warnings
+        {
+          std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry computed to zero (" << value << ") in row " << i << "!" << std::endl;
+          throw zero_on_diagonal_exception("ILUT zero diagonal!");
+        }
+      }
+      else // entry for U:
+        detail::insert_with_value_sort(sorted_entries_U, col, value);
+    }
+
+    //Lines 10-12: Apply a dropping rule to w, write the largest p values to L and U
+    unsigned int offset_L = row_buffer_L[i];
+    std::sort(sorted_entries_L.begin(), sorted_entries_L.end());
+    for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+      if (std::fabs(sorted_entries_L[j].second) > 0)
+      {
+        col_buffer_L[offset_L] = sorted_entries_L[j].first;
+        elements_L[offset_L]   = sorted_entries_L[j].second;
+        ++offset_L;
+      }
+    row_buffer_L[i+1] = offset_L;
+
+    unsigned int offset_U = row_buffer_U[i];
+    col_buffer_U[offset_U] = static_cast<unsigned int>(i);
+    elements_U[offset_U]   = diagonal_U[i];
+    ++offset_U;
+    std::sort(sorted_entries_U.begin(), sorted_entries_U.end());
+    for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+      if (std::fabs(sorted_entries_U[j].second) > 0)
+      {
+        col_buffer_U[offset_U] = sorted_entries_U[j].first;
+        elements_U[offset_U]   = sorted_entries_U[j].second;
+        ++offset_U;
+      }
+    row_buffer_U[i+1] = offset_U;
+
+  } //for i
+}
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilut_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ilut_precond(MatrixT const & mat, ilut_tag const & tag) : tag_(tag), L_(mat.size1(), mat.size2()), U_(mat.size1(), mat.size2())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //Note: Since vec can be a rather arbitrary vector type, we call the more generic version in the backend manually:
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle2());
+      NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(L_.handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, L_.size2(), unit_lower_tag());
+    }
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle2());
+      NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(U_.handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, U_.size2(), upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<NumericType> temp;
+    viennacl::switch_memory_context(temp, host_context);
+    viennacl::switch_memory_context(L_, host_context);
+    viennacl::switch_memory_context(U_, host_context);
+
+    viennacl::copy(mat, temp);
+
+    viennacl::linalg::precondition(temp, L_, U_, tag_);
+  }
+
+  ilut_tag tag_;
+  viennacl::compressed_matrix<NumericType> L_;
+  viennacl::compressed_matrix<NumericType> U_;
+};
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilut_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+typedef viennacl::compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ilut_precond(MatrixType const & mat, ilut_tag const & tag)
+    : tag_(tag),
+      L_(mat.size1(), mat.size2(), viennacl::traits::context(mat)),
+      U_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(viennacl::vector<NumericT> & vec) const
+  {
+    if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal on GPU..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+
+      }
+      else
+      {
+        viennacl::context host_context(viennacl::MAIN_MEMORY);
+        viennacl::context old_context = viennacl::traits::context(vec);
+        viennacl::switch_memory_context(vec, host_context);
+        viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+        viennacl::switch_memory_context(vec, old_context);
+      }
+    }
+    else //apply ILUT directly:
+    {
+      viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+      viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(L_, host_context);
+    viennacl::switch_memory_context(U_, host_context);
+
+    if (viennacl::traits::context(mat).memory_type() == viennacl::MAIN_MEMORY)
+    {
+      viennacl::linalg::precondition(mat, L_, U_, tag_);
+    }
+    else //we need to copy to CPU
+    {
+      viennacl::compressed_matrix<NumericT> cpu_mat(mat.size1(), mat.size2(), viennacl::traits::context(mat));
+      viennacl::switch_memory_context(cpu_mat, host_context);
+
+      cpu_mat = mat;
+
+      viennacl::linalg::precondition(cpu_mat, L_, U_, tag_);
+    }
+
+    if (!tag_.use_level_scheduling())
+      return;
+
+    //
+    // multifrontal part:
+    //
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+    multifrontal_U_diagonal_.resize(U_.size1(), false);
+    host_based::detail::row_info(U_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+    detail::level_scheduling_setup_L(L_,
+                                     multifrontal_U_diagonal_, //dummy
+                                     multifrontal_L_row_index_arrays_,
+                                     multifrontal_L_row_buffers_,
+                                     multifrontal_L_col_buffers_,
+                                     multifrontal_L_element_buffers_,
+                                     multifrontal_L_row_elimination_num_list_);
+
+
+    detail::level_scheduling_setup_U(U_,
+                                     multifrontal_U_diagonal_,
+                                     multifrontal_U_row_index_arrays_,
+                                     multifrontal_U_row_buffers_,
+                                     multifrontal_U_col_buffers_,
+                                     multifrontal_U_element_buffers_,
+                                     multifrontal_U_row_elimination_num_list_);
+
+    //
+    // Bring to device if necessary:
+    //
+
+    // L:
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                       it != multifrontal_L_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                       it != multifrontal_L_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                       it != multifrontal_L_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                       it != multifrontal_L_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+    // U:
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                       it != multifrontal_U_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                       it != multifrontal_U_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                       it != multifrontal_U_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                       it != multifrontal_U_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+  }
+
+  ilut_tag tag_;
+  viennacl::compressed_matrix<NumericT> L_;
+  viennacl::compressed_matrix<NumericT> U_;
+
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+  std::list<vcl_size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+  std::list<vcl_size_t > multifrontal_U_row_elimination_num_list_;
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+
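The ILUT preconditioner above is driven the same way; a short sketch with the tag parameters spelled out (reusing a system matrix A and right-hand side rhs assembled as in the ILU0 sketch further up, and with BiCGStab as the outer solver, since ILUT is typically applied to non-symmetric systems):

  #include "viennacl/linalg/bicgstab.hpp"
  #include "viennacl/linalg/ilu.hpp"

  // keep at most 30 entries per row in each of L and U, drop entries whose
  // magnitude falls below 1e-5 relative to the row norm, enable level scheduling:
  viennacl::linalg::ilut_tag ilut_config(30, 1e-5, true);

  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<double> > ilut(A, ilut_config);

  viennacl::vector<double> x = viennacl::linalg::solve(A, rhs,
                                                       viennacl::linalg::bicgstab_tag(1e-8, 200),
                                                       ilut);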

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
new file mode 100644
index 0000000..0e2abb0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
@@ -0,0 +1,103 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+#define VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_applier.hpp
+ *
+ * @brief Defines the action of certain unary and binary operators and their arguments (for host execution).
+*/
+
+#include "viennacl/forwards.h"
+#include <cmath>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/** @brief Worker class defining the scalar action of a unary or binary operation tag.
+  *
+  * @tparam OpT   The operation tag (e.g. op_element_binary<op_prod>) whose action is provided by a specialization
+*/
+template<typename OpT>
+struct op_applier
+{
+  typedef typename OpT::ERROR_UNKNOWN_OP_TAG_PROVIDED    error_type;
+};
+
+/** \cond */
+template<>
+struct op_applier<op_element_binary<op_prod> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = x * y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_div> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = x / y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_pow> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = std::pow(x, y); }
+};
+
+#define VIENNACL_MAKE_UNARY_OP_APPLIER(funcname)  \
+template<> \
+struct op_applier<op_element_unary<op_##funcname> > \
+{ \
+  template<typename T> \
+  static void apply(T & result, T const & x) { using namespace std; result = funcname(x); } \
+}
+
+VIENNACL_MAKE_UNARY_OP_APPLIER(abs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(acos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(asin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(atan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(ceil);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cosh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(exp);
+VIENNACL_MAKE_UNARY_OP_APPLIER(fabs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(floor);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log10);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sinh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sqrt);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tanh);
+
+#undef VIENNACL_MAKE_UNARY_OP_APPLIER
+/** \endcond */
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
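A brief sketch of how these specializations are meant to be used by a host backend: the operation tag is resolved at compile time and then applied element by element. The helper below is illustrative only and not part of ViennaCL:

  #include <vector>
  #include <cstddef>
  #include "viennacl/linalg/detail/op_applier.hpp"

  // Applies a binary element-wise operation tag OpT across three buffers.
  template<typename OpT, typename NumericT>
  void apply_binary_elementwise(std::vector<NumericT>       & result,
                                std::vector<NumericT> const & x,
                                std::vector<NumericT> const & y)
  {
    for (std::size_t i = 0; i < result.size(); ++i)
      viennacl::linalg::detail::op_applier<OpT>::apply(result[i], x[i], y[i]);
  }

  // e.g. element-wise product, result[i] = x[i] * y[i]:
  //   apply_binary_elementwise< viennacl::op_element_binary<viennacl::op_prod> >(result, x, y);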

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
new file mode 100644
index 0000000..bd49b3b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+#define VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_executor.hpp
+ *
+ * @brief Defines the worker class for decomposing an expression tree into small chunks, which can be processed by the predefined operations in ViennaCL.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+template<typename NumericT, typename B>
+bool op_aliasing(vector_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+  return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_base<NumericT> const & b)
+{
+  return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+  return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+template<typename NumericT, typename B>
+bool op_aliasing(matrix_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+  return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_base<NumericT> const & b)
+{
+  return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+  return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+/** @brief Worker class for decomposing expression templates.
+  *
+  * @tparam A    Type to which the result is assigned
+  * @tparam OP   One out of {op_assign, op_inplace_add, op_inplace_sub}
+  * @tparam T    Right-hand side of the assignment
+*/
+template<typename A, typename OP, typename T>
+struct op_executor {};
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
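The op_aliasing() overloads above are the guard that concrete op_executor specializations (defined alongside the vector and matrix operations) use to decide whether the assignment target also appears on the right-hand side, in which case the expression is first evaluated into a temporary. A free-standing, purely illustrative rendering of that pattern:

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/detail/op_executor.hpp"

  // Evaluate "lhs = proxy" safely: if lhs is aliased by an operand of the
  // expression, evaluate into a temporary first (this mirrors what the real
  // op_executor specializations do internally).
  template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
  void assign_without_aliasing(viennacl::vector_base<NumericT> & lhs,
                               viennacl::vector_expression<const LhsT, const RhsT, OpT> const & proxy)
  {
    if (viennacl::linalg::detail::op_aliasing(lhs, proxy))
    {
      viennacl::vector<NumericT> temp(proxy);  // force evaluation into fresh storage
      lhs = temp;
    }
    else
      lhs = proxy;                             // safe to evaluate directly into lhs
  }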

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
new file mode 100644
index 0000000..12ff77b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_matrix.hpp
+    @brief Implementation of a bunch of (small) matrices on GPU. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents contiguous matrices on the GPU
+*/
+
+class block_matrix
+{
+public:
+
+  ////////// non-const
+
+  /** @brief Returns a handle to the elements */
+  viennacl::ocl::handle<cl_mem>& handle(){ return elements_; }
+
+  /** @brief Returns a handle to the matrix dimensions */
+  viennacl::ocl::handle<cl_mem>& handle1() { return matrix_dimensions_; }
+
+  /** @brief Returns a handle to the start indices of the individual matrices */
+  viennacl::ocl::handle<cl_mem>& handle2() { return start_block_inds_; }
+
+  ////////// const
+
+  /** @brief Returns a handle to the const elements */
+  const viennacl::ocl::handle<cl_mem>& handle() const { return elements_; }
+
+  /** @brief Returns a handle to the const matrix dimensions */
+  const viennacl::ocl::handle<cl_mem>& handle1() const { return matrix_dimensions_; }
+
+  /** @brief Returns a handle to the const start indices of the individual matrices */
+  const viennacl::ocl::handle<cl_mem>& handle2() const { return start_block_inds_; }
+
+private:
+  viennacl::ocl::handle<cl_mem> elements_;
+  viennacl::ocl::handle<cl_mem> matrix_dimensions_;
+  viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
new file mode 100644
index 0000000..eee6aef
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
@@ -0,0 +1,77 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_vector.hpp
+    @brief Implementation of a bunch of vectors on GPU. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents a contiguous vector on the GPU, storing a concatenation of small vectors
+*/
+class block_vector
+{
+public:
+
+  ///////////// non-const
+
+  /** @brief Return handle to the elements */
+  viennacl::ocl::handle<cl_mem> & handle(){ return elements_; }
+
+  /** @brief Return handle to start indices */
+  viennacl::ocl::handle<cl_mem> & handle1() { return start_block_inds_; }
+
+  ///////////// const
+
+  /** @brief Return handle to the const elements */
+  const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
+
+  /** @brief Return handle to const start indices */
+  const viennacl::ocl::handle<cl_mem> & handle1() const { return start_block_inds_; }
+
+private:
+  viennacl::ocl::handle<cl_mem> elements_;
+  viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
new file mode 100644
index 0000000..fab81d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
@@ -0,0 +1,402 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+//#include <omp.h>
+
+/** @file viennacl/linalg/detail/spai/fspai.hpp
+    @brief Implementation of FSPAI. Experimental.
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief A tag for FSPAI. Experimental.
+*
+* Contains values for the algorithm.
+* Must be passed to the fspai_precond constructor
+*/
+class fspai_tag
+{
+public:
+  /** @brief Constructor
+   *
+   * @param residual_norm_threshold Calculate until the norm of the residual falls below this threshold
+   * @param iteration_limit Maximum number of iterations
+   * @param is_static Determines whether the static version of SPAI should be used
+   * @param is_right Determines whether a left or right preconditioner should be used
+   */
+  fspai_tag(
+          double residual_norm_threshold = 1e-3,
+          unsigned int iteration_limit = 5,
+          bool is_static = false,
+          bool is_right = false)
+    : residual_norm_threshold_(residual_norm_threshold),
+      iteration_limit_(iteration_limit),
+      is_static_(is_static),
+      is_right_(is_right) {}
+
+  inline double getResidualNormThreshold() const { return residual_norm_threshold_; }
+  inline unsigned long getIterationLimit () const { return iteration_limit_; }
+  inline bool getIsStatic() const { return is_static_; }
+  inline bool getIsRight() const  { return is_right_; }
+  inline void setResidualNormThreshold(double residual_norm_threshold)
+  {
+    if (residual_norm_threshold > 0)
+      residual_norm_threshold_ = residual_norm_threshold;
+  }
+  inline void setIterationLimit(unsigned long iteration_limit)
+  {
+    if (iteration_limit > 0)
+      iteration_limit_ = iteration_limit;
+  }
+  inline void setIsRight(bool is_right)   { is_right_  = is_right; }
+  inline void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+private:
+  double residual_norm_threshold_;
+  unsigned long iteration_limit_;
+  bool is_static_;
+  bool is_right_;
+};
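A short usage sketch of the tag defined above (the numbers are arbitrary): construct it with the desired settings, or adjust them afterwards through the setters, which silently ignore non-positive values.

  // Configure FSPAI: tighter residual threshold, more sweeps, left preconditioner.
  viennacl::linalg::detail::spai::fspai_tag tag(1e-4,  // residual_norm_threshold
                                                10,    // iteration_limit
                                                false, // is_static
                                                false  // is_right
                                               );
  tag.setResidualNormThreshold(5e-5);  // accepted (positive)
  tag.setIterationLimit(0);            // ignored: non-positive values leave the limit unchanged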
+
+
+//
+// Helper: Store A in an STL container of type std::vector<std::map<unsigned int, NumericT> >, exploiting symmetry (only the lower triangular part is kept)
+// Reason: the uBLAS interface does not allow iterating over the nonzeros of a particular row without starting an iterator1 from the very beginning of the matrix.
+//
+template<typename MatrixT, typename NumericT>
+void sym_sparse_matrix_to_stl(MatrixT const & A, std::vector<std::map<unsigned int, NumericT> > & STL_A)
+{
+  STL_A.resize(A.size1());
+  for (typename MatrixT::const_iterator1 row_it  = A.begin1();
+                                         row_it != A.end1();
+                                       ++row_it)
+  {
+    for (typename MatrixT::const_iterator2 col_it  = row_it.begin();
+                                           col_it != row_it.end();
+                                         ++col_it)
+    {
+      if (col_it.index1() >= col_it.index2())
+        STL_A[col_it.index1()][static_cast<unsigned int>(col_it.index2())] = *col_it;
+      else
+        break; //go to next row
+    }
+  }
+}
+
+
+//
+// Generate index sets J_k, k=0,...,N-1
+//
+template<typename MatrixT>
+void generateJ(MatrixT const & A, std::vector<std::vector<vcl_size_t> > & J)
+{
+  for (typename MatrixT::const_iterator1 row_it  = A.begin1();
+                                         row_it != A.end1();
+                                       ++row_it)
+  {
+    for (typename MatrixT::const_iterator2 col_it  = row_it.begin();
+                                           col_it != row_it.end();
+                                         ++col_it)
+    {
+      if (col_it.index1() > col_it.index2()) //Matrix is symmetric, thus only work on lower triangular part
+      {
+        J[col_it.index2()].push_back(col_it.index1());
+        J[col_it.index1()].push_back(col_it.index2());
+      }
+      else
+        break; //go to next row
+    }
+  }
+}
+
+
+//
+// Extracts the blocks A(\tilde{J}_k, \tilde{J}_k) from A
+// Sets up y_k = A(\tilde{J}_k, k) for the in-place solution after the Cholesky factorization
+//
+template<typename NumericT, typename MatrixT, typename VectorT>
+void fill_blocks(std::vector< std::map<unsigned int, NumericT> > & A,
+                 std::vector<MatrixT>                            & blocks,
+                 std::vector<std::vector<vcl_size_t> > const     & J,
+                 std::vector<VectorT>                            & Y)
+{
+  for (vcl_size_t k=0; k<A.size(); ++k)
+  {
+    std::vector<vcl_size_t> const & Jk = J[k];
+    VectorT & yk = Y[k];
+    MatrixT & block_k = blocks[k];
+
+    yk.resize(Jk.size());
+    block_k.resize(Jk.size(), Jk.size());
+    block_k.clear();
+
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+    {
+      vcl_size_t row_index = Jk[i];
+      std::map<unsigned int, NumericT> & A_row = A[row_index];
+
+      //fill y_k:
+      yk[i] = A_row[static_cast<unsigned int>(k)];
+
+      for (vcl_size_t j=0; j<Jk.size(); ++j)
+      {
+        vcl_size_t col_index = Jk[j];
+        if (col_index <= row_index && A_row.find(static_cast<unsigned int>(col_index)) != A_row.end()) //block is symmetric, thus store only lower triangular part
+          block_k(i, j) = A_row[static_cast<unsigned int>(col_index)];
+      }
+    }
+  }
+}
+
+
+//
+// Perform the Cholesky factorization of A in place. Cf. Schwarz: Numerische Mathematik, vol 5, p. 58
+//
+template<typename MatrixT>
+void cholesky_decompose(MatrixT & A)
+{
+  for (vcl_size_t k=0; k<A.size2(); ++k)
+  {
+    if (A(k,k) <= 0)
+    {
+      std::cout << "k: " << k << std::endl;
+      std::cout << "A(k,k): " << A(k,k) << std::endl;
+    }
+
+    assert(A(k,k) > 0 && bool("Matrix not positive definite in Cholesky factorization."));
+
+    A(k,k) = std::sqrt(A(k,k));
+
+    for (vcl_size_t i=k+1; i<A.size1(); ++i)
+    {
+      A(i,k) /= A(k,k);
+      for (vcl_size_t j=k+1; j<=i; ++j)
+        A(i,j) -= A(i,k) * A(j,k);
+    }
+  }
+}
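In formulas, the in-place (outer-product) variant implemented above performs, for k = 0, ..., n-1 (LaTeX notation),

  A_{kk} \leftarrow \sqrt{A_{kk}}, \qquad
  A_{ik} \leftarrow A_{ik} / A_{kk} \quad (i > k), \qquad
  A_{ij} \leftarrow A_{ij} - A_{ik} A_{jk} \quad (k < j \le i),

so that on exit the lower triangle of A holds the Cholesky factor L with A = L L^T.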
+
+
+//
+// Solve A x = b in place (the solution overwrites b), where A = L L^T is already Cholesky-factored
+//
+template<typename MatrixT, typename VectorT>
+void cholesky_solve(MatrixT const & L, VectorT & b)
+{
+  // inplace forward solve L x = b
+  for (vcl_size_t i=0; i<L.size1(); ++i)
+  {
+    for (vcl_size_t j=0; j<i; ++j)
+      b[i] -= L(i,j) * b[j];
+    b[i] /= L(i,i);
+  }
+
+  // inplace backward solve L^T x = b:
+  for (vcl_size_t i=L.size1()-1;; --i)
+  {
+    for (vcl_size_t k=i+1; k<L.size1(); ++k)
+      b[i] -= L(k,i) * b[k];
+    b[i] /= L(i,i);
+
+    if (i==0) //vcl_size_t might be unsigned, therefore manual check for equality with zero here
+      break;
+  }
+}
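Written out, the two sweeps above are the usual forward and backward substitutions; note that the backward sweep indexes L(k,i), i.e. L^T, so the transpose is never formed explicitly:

  b_i \leftarrow \Big( b_i - \sum_{j<i} L_{ij}\, b_j \Big) / L_{ii} \quad (i = 0, \dots, n-1), \qquad
  b_i \leftarrow \Big( b_i - \sum_{k>i} L_{ki}\, b_k \Big) / L_{ii} \quad (i = n-1, \dots, 0).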
+
+
+
+//
+// Compute the Cholesky factor L from the sparse vectors y_k
+//
+template<typename MatrixT, typename VectorT>
+void computeL(MatrixT const & A,
+              MatrixT       & L,
+              MatrixT       & L_trans,
+              std::vector<VectorT> & Y,
+              std::vector<std::vector<vcl_size_t> > & J)
+{
+  typedef typename VectorT::value_type                          NumericType;
+  typedef std::vector<std::map<unsigned int, NumericType> >     STLSparseMatrixType;
+
+  STLSparseMatrixType L_temp(A.size1());
+
+  for (vcl_size_t k=0; k<A.size1(); ++k)
+  {
+    std::vector<vcl_size_t> const & Jk = J[k];
+    VectorT const & yk = Y[k];
+
+    //compute L(k,k):
+    NumericType Lkk = A(k,k);
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+      Lkk -= A(Jk[i],k) * yk[i];
+
+    Lkk = NumericType(1) / std::sqrt(Lkk);
+    L_temp[k][static_cast<unsigned int>(k)] = Lkk;
+    L_trans(k,k) = Lkk;
+
+    //write lower diagonal entries:
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+    {
+      L_temp[Jk[i]][static_cast<unsigned int>(k)] = -Lkk * yk[i];
+      L_trans(k, Jk[i]) = -Lkk * yk[i];
+    }
+  } //for k
+
+
+  //build L from L_temp
+  for (vcl_size_t i=0; i<L_temp.size(); ++i)
+    for (typename std::map<unsigned int, NumericType>::const_iterator it = L_temp[i].begin();
+           it != L_temp[i].end();
+         ++it)
+      L(i, it->first) = it->second;
+}
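The per-column formulas evaluated in the loop above are, with J_k the pattern index set of column k and y_k the solution of the block system from the previous step,

  L_{kk} = \Big( A_{kk} - \sum_{i} A_{J_k(i),\,k}\; y_k(i) \Big)^{-1/2}, \qquad
  L_{J_k(i),\,k} = -L_{kk}\; y_k(i).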
+
+
+//
+// Top level FSPAI function
+//
+template<typename MatrixT>
+void computeFSPAI(MatrixT const & A,
+                  MatrixT const & PatternA,
+                  MatrixT       & L,
+                  MatrixT       & L_trans,
+                  fspai_tag)
+{
+  typedef typename MatrixT::value_type                    NumericT;
+  typedef boost::numeric::ublas::matrix<NumericT>         DenseMatrixType;
+  typedef std::vector<std::map<unsigned int, NumericT> >  SparseMatrixType;
+
+  //
+  // preprocessing: store A in an STL container:
+  //
+  //std::cout << "Transferring to STL container:" << std::endl;
+  std::vector<std::vector<NumericT> >    y_k(A.size1());
+  SparseMatrixType   STL_A(A.size1());
+  sym_sparse_matrix_to_stl(A, STL_A);
+
+
+  //
+  // Step 1: Generate pattern indices
+  //
+  //std::cout << "computeFSPAI(): Generating pattern..." << std::endl;
+  std::vector<std::vector<vcl_size_t> > J(A.size1());
+  generateJ(PatternA, J);
+
+  //
+  // Step 2: Set up matrix blocks
+  //
+  //std::cout << "computeFSPAI(): Setting up matrix blocks..." << std::endl;
+  std::vector<DenseMatrixType>  subblocks_A(A.size1());
+  fill_blocks(STL_A, subblocks_A, J, y_k);
+  STL_A.clear(); //not needed anymore
+
+  //
+  // Step 3: Cholesky-factor blocks
+  //
+  //std::cout << "computeFSPAI(): Cholesky-factorization..." << std::endl;
+  for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+  {
+    //std::cout << "Block before: " << subblocks_A[i] << std::endl;
+    cholesky_decompose(subblocks_A[i]);
+    //std::cout << "Block after: " << subblocks_A[i] << std::endl;
+  }
+
+
+  /*vcl_size_t num_bytes = 0;
+  for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+    num_bytes += 8*subblocks_A[i].size1()*subblocks_A[i].size2();*/
+  //std::cout << "Memory for FSPAI matrix: " << num_bytes / (1024.0 * 1024.0) << " MB" << std::endl;
+
+  //
+  // Step 4: Solve for y_k
+  //
+  //std::cout << "computeFSPAI(): Cholesky-solve..." << std::endl;
+  for (vcl_size_t i=0; i<y_k.size(); ++i)
+  {
+    if (subblocks_A[i].size1() > 0) //block might be empty...
+    {
+      //y_k[i].resize(subblocks_A[i].size1());
+      //std::cout << "y_k[" << i << "]: ";
+      //for (vcl_size_t j=0; j<y_k[i].size(); ++j)
+      //  std::cout << y_k[i][j] << " ";
+      //std::cout << std::endl;
+      cholesky_solve(subblocks_A[i], y_k[i]);
+    }
+  }
+
+
+  //
+  // Step 5: Set up Cholesky factors L and L_trans
+  //
+  //std::cout << "computeFSPAI(): Computing L..." << std::endl;
+  L.resize(A.size1(), A.size2(), false);
+  L.reserve(A.nnz(), false);
+  L_trans.resize(A.size1(), A.size2(), false);
+  L_trans.reserve(A.nnz(), false);
+  computeL(A, L, L_trans, y_k, J);
+
+  //std::cout << "L: " << L << std::endl;
+}
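A hedged end-to-end sketch of the routine above using Boost.uBLAS types. The tiny SPD matrix, its values, and the choice of PatternA = A are made up for illustration only.

  #include <boost/numeric/ublas/matrix_sparse.hpp>

  typedef boost::numeric::ublas::compressed_matrix<double> SparseMatrix;

  SparseMatrix A(3, 3, 7);                       // small symmetric positive definite example
  A(0,0) = 4.0; A(1,1) = 4.0; A(2,2) = 4.0;
  A(0,1) = 1.0; A(1,0) = 1.0;
  A(1,2) = 1.0; A(2,1) = 1.0;

  SparseMatrix L, L_trans;
  viennacl::linalg::detail::spai::fspai_tag tag; // defaults as declared earlier in this header

  // Use the sparsity pattern of A itself as the target pattern for the factor:
  viennacl::linalg::detail::spai::computeFSPAI(A, A, L, L_trans, tag);
  // L and L_trans now hold the factorized sparse approximate inverse used for preconditioning.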
+
+
+
+}
+}
+}
+}
+
+#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
new file mode 100644
index 0000000..f23223f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
@@ -0,0 +1,2052 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/host_based/matrix_operations.hpp
+    @brief Implementations of dense matrix-related operations, including matrix-vector products, using plain single-threaded or OpenMP-enabled execution on the CPU.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// Minimum matrix size (size1*size2) for using OpenMP on matrix operations:
+#ifndef VIENNACL_OPENMP_MATRIX_MIN_SIZE
+  #define VIENNACL_OPENMP_MATRIX_MIN_SIZE  5000
+#endif
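Because the threshold above is wrapped in #ifndef, it can be overridden per translation unit before any ViennaCL header is included; a minimal sketch:

  // Only parallelize matrices with more than one million entries:
  #define VIENNACL_OPENMP_MATRIX_MIN_SIZE 1000000
  #include "viennacl/matrix.hpp"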
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. There is no need to double-check them here!
+//
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Conversion between mixed matrix layouts not supported yet!"));
+
+  DestNumericT      * data_A = detail::extract_raw_pointer<DestNumericT>(mat1);
+  SrcNumericT const * data_B = detail::extract_raw_pointer<SrcNumericT>(mat2);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<DestNumericT,      row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<SrcNumericT const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+  }
+  else
+  {
+    detail::matrix_array_wrapper<DestNumericT,      column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<SrcNumericT const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+  }
+}
+
+
+
+template<typename NumericT,
+         typename SizeT, typename DistanceT>
+void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,
+           const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy, matrix_base<NumericT> & temp_trans)
+{
+  typedef NumericT        value_type;
+  const value_type * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs());
+  value_type * data_B       = detail::extract_raw_pointer<value_type>(temp_trans);
+
+  vcl_size_t A_start1         = viennacl::traits::start1(proxy.lhs());
+  vcl_size_t A_start2         = viennacl::traits::start2(proxy.lhs());
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+  vcl_size_t A_inc1           = viennacl::traits::stride1(proxy.lhs());
+  vcl_size_t A_inc2           = viennacl::traits::stride2(proxy.lhs());
+  vcl_size_t A_size1          = viennacl::traits::size1(proxy.lhs());
+  vcl_size_t A_size2          = viennacl::traits::size2(proxy.lhs());
+
+  vcl_size_t B_start1         = viennacl::traits::start1(temp_trans);
+  vcl_size_t B_start2         = viennacl::traits::start2(temp_trans);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(temp_trans);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(temp_trans);
+  vcl_size_t B_inc1           = viennacl::traits::stride1(temp_trans);
+  vcl_size_t B_inc2           = viennacl::traits::stride2(temp_trans);
+
+  const vcl_size_t sub_mat_size = 64; //The matrix will be divided into sub-matrices for better storage access.
+
+  vcl_size_t row_count = A_size1 / sub_mat_size;
+  vcl_size_t col_count = A_size2 / sub_mat_size;
+
+  vcl_size_t row_count_remainder = A_size1 % sub_mat_size;
+  vcl_size_t col_count_remainder = A_size2 % sub_mat_size;
+
+  if (proxy.lhs().row_major())
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+    {
+      vcl_size_t row = vcl_size_t(i) / col_count;
+      vcl_size_t col = vcl_size_t(i) % col_count;
+
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+                                                                               , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+                                                                               , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+                                                                               , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+          wrapper_B(j, k) = wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the right side of the matrix
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1
+                                                                               , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+                                                                               , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col_count * sub_mat_size)
+                                                                               , B_start2, B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+        for(vcl_size_t k = 0 ; k < A_size1; ++k)
+          wrapper_B(j, k) = wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the bottom side of the matrix
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+                                                                               , A_start2, A_inc1
+                                                                               , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B,B_start1
+                                                                               , B_start2  + B_inc2 * (row_count * sub_mat_size), B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+          wrapper_B(k, j) = wrapper_A(j, k);
+    }
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+    {
+      vcl_size_t row = vcl_size_t(i) / col_count;
+      vcl_size_t col = vcl_size_t(i) % col_count;
+
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+                                                                                  , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+                                                                                  , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+          wrapper_B(k, j)=wrapper_A(j, k);
+    }
+    { //This is the transposition of the remainder on the right side of the matrix
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1
+                                                                                  , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B,B_start1 + B_inc1 * (col_count * sub_mat_size)
+                                                                                  , B_start2, B_inc1
+                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < A_size1; ++k)
+          wrapper_B(j, k)=wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the bottom side of the matrix
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+                                                                                  , A_start2, A_inc1
+                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B, B_start1
+                                                                                  , B_start2  + B_inc2 * (row_count * sub_mat_size), B_inc1
+                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+          wrapper_B(k, j)=wrapper_A(j, k);
+    }
+  }
+}
+
+template<typename NumericT, typename ScalarT1>
+void am(matrix_base<NumericT> & mat1,
+        matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    if (reciprocal_alpha)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    if (reciprocal_alpha)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+    }
+  }
+}
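For orientation, this backend kernel is what scaled matrix assignments at the user-facing API roughly reduce to; a sketch under the assumption of the standard operator overloads, without showing the exact dispatch path through the viennacl::linalg frontend:

  viennacl::matrix<double> A(64, 64), B(64, 64);
  double alpha = 2.5;

  A = alpha * B;   // am() with reciprocal_alpha == false, flip_sign_alpha == false
  A = B / alpha;   // am() with reciprocal_alpha == true
  A = -alpha * B;  // negative scaling; the frontend may encode the sign via flip_sign_alpha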
+
+
+template<typename NumericT,
+         typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+  vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
+  vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
+  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
+  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+
+}
+
+
+template<typename NumericT,
+         typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+  vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
+  vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
+  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
+  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+
+}
+
+
+
+
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  typedef NumericT        value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type    alpha = static_cast<value_type>(s);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  vcl_size_t A_size1  = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+  vcl_size_t A_size2  = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        wrapper_A(static_cast<vcl_size_t>(row), col) = alpha;
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha;
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+  }
+}
+
+
+
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+  typedef NumericT        value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type    alpha = static_cast<value_type>(s);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      wrapper_A(row, row) = alpha;
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      wrapper_A(row, row) = alpha;
+  }
+}
+
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+  typedef NumericT        value_type;
+
+  value_type       *data_A   = detail::extract_raw_pointer<value_type>(mat);
+  value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  vcl_size_t row_start = 0;
+  vcl_size_t col_start = 0;
+
+  if (k >= 0)
+    col_start = static_cast<vcl_size_t>(k);
+  else
+    row_start = static_cast<vcl_size_t>(-k);
+
+  matrix_assign(mat, NumericT(0));
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+  }
+}
+
+template<typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A   = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  vcl_size_t row_start = 0;
+  vcl_size_t col_start = 0;
+
+  if (k >= 0)
+    col_start = static_cast<vcl_size_t>(k);
+  else
+    row_start = static_cast<vcl_size_t>(-k);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
+  }
+}
+
+template<typename NumericT>
+void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A   = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t j = 0; j < v_size; ++j)
+      data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t j = 0; j < v_size; ++j)
+      data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
+  }
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A   = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
+  }
+}
+
+//
+///////////////////////// Element-wise operation //////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+
+/** @brief Implementation of the element-wise operations A = B .* C and A = B ./ C    (using MATLAB syntax)
+*
+* @param A      The result matrix (or -range, or -slice)
+* @param proxy  The proxy object holding B, C, and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> >    OpFunctor;
+
+  value_type       * data_A = detail::extract_raw_pointer<value_type>(A);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+  value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+  vcl_size_t A_start1 = viennacl::traits::start1(A);
+  vcl_size_t A_start2 = viennacl::traits::start2(A);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+  vcl_size_t A_size1  = viennacl::traits::size1(A);
+  vcl_size_t A_size2  = viennacl::traits::size2(A);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+  vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+  vcl_size_t B_inc1   = viennacl::traits::stride1(proxy.lhs());
+  vcl_size_t B_inc2   = viennacl::traits::stride2(proxy.lhs());
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy.lhs());
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy.lhs());
+
+  vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs());
+  vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs());
+  vcl_size_t C_inc1   = viennacl::traits::stride1(proxy.rhs());
+  vcl_size_t C_inc2   = viennacl::traits::stride2(proxy.rhs());
+  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(proxy.rhs());
+  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(proxy.rhs());
+
+  if (A.row_major())
+  {
+    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // =   data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+        //   + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // =   data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+        //   + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+  }
+}
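
The loops above never touch raw indices directly: all start/stride/padding bookkeeping is hidden behind matrix_array_wrapper, and op_applier supplies the actual scalar operation. A minimal standalone sketch of that pattern on plain row-major storage (illustrative names, not the ViennaCL classes):

  #include <cstddef>
  #include <functional>

  // A strided sub-matrix view of a padded row-major buffer.
  struct row_major_view
  {
    double * data;
    std::size_t start1, start2, inc1, inc2, internal_size2;   // internal_size2 = padded leading dimension
    double & operator()(std::size_t i, std::size_t j) const
    { return data[(start1 + i * inc1) * internal_size2 + (start2 + j * inc2)]; }
  };

  // A = op(B, C) applied element-wise over the logical size of the views.
  template<typename BinaryOp>
  void element_op_sketch(row_major_view A, row_major_view B, row_major_view C,
                         std::size_t rows, std::size_t cols, BinaryOp op)
  {
    for (std::size_t i = 0; i < rows; ++i)
      for (std::size_t j = 0; j < cols; ++j)
        A(i, j) = op(B(i, j), C(i, j));    // e.g. std::multiplies<double>() for A = B .* C
  }
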
+
+// Unary operations
+
+// A = op(B)
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> >    OpFunctor;
+
+  value_type       * data_A = detail::extract_raw_pointer<value_type>(A);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+  vcl_size_t A_start1 = viennacl::traits::start1(A);
+  vcl_size_t A_start2 = viennacl::traits::start2(A);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+  vcl_size_t A_size1  = viennacl::traits::size1(A);
+  vcl_size_t A_size2  = viennacl::traits::size2(A);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+  vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+  vcl_size_t B_inc1   = viennacl::traits::stride1(proxy.lhs());
+  vcl_size_t B_inc2   = viennacl::traits::stride2(proxy.lhs());
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy.lhs());
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy.lhs());
+
+  if (A.row_major())
+  {
+    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+  }
+}
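
For the unary case the structure is identical; only the functor changes. As a rough illustration of how an op_applier-style tag dispatch can be set up (hypothetical names, not the actual viennacl::linalg::detail definitions):

  #include <cmath>

  struct op_fabs {};   // operation tags: the operation is a type, not a value
  struct op_prod {};

  template<typename OpT> struct apply_op;   // primary template intentionally undefined

  template<> struct apply_op<op_fabs>
  { static void apply(double & result, double x)           { result = std::fabs(x); } };

  template<> struct apply_op<op_prod>
  { static void apply(double & result, double x, double y) { result = x * y; } };

  // Usage inside a generic element-wise loop:  apply_op<OpT>::apply(A(i,j), B(i,j));
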
+
+
+
+//
+/////////////////////////   matrix-vector products /////////////////////////////////
+//
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param trans  Flag whether mat is to be transposed
+* @param vec    The vector
+* @param result The result vector
+*/
+template<typename NumericT>
+void prod_impl(const matrix_base<NumericT> & mat, bool trans,
+               const vector_base<NumericT> & vec,
+                     vector_base<NumericT> & result)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
+  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t start1 = viennacl::traits::start(vec);
+  vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+  vcl_size_t start2 = viennacl::traits::start(result);
+  vcl_size_t inc2   = viennacl::traits::stride(result);
+
+  if (mat.row_major())
+  {
+    if (trans)
+    {
+      vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+      if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+        thread_count = omp_get_max_threads();
+#endif
+      std::vector<value_type> temp_array(A_size2*thread_count, 0);
+      detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        wrapper_res(col) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      {
+        vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+        if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+          id = omp_get_thread_num();
+#endif
+        vcl_size_t begin = (A_size1 * id) / thread_count;
+        vcl_size_t end   = (A_size1 * (id + 1)) / thread_count;
+
+        detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_mat(data_A, A_start1 + A_inc1 * begin, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+        for (vcl_size_t row = 0; row < (end - begin); ++row)  //run through matrix sequentially
+        {
+          value_type temp = wrapper_vec(row);
+          for (vcl_size_t col = 0; col < A_size2; ++col)
+            temp_array[A_size2 * id + col] += wrapper_mat(row , col) * temp;
+        }
+      }
+      for (vcl_size_t id = 0; id < thread_count; ++id)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_res(col) += temp_array[A_size2 * id + col];
+    }
+
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      {
+        value_type temp = 0;
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+        data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+      }
+    }
+  }
+  else
+  {
+    if (!trans)
+    {
+      vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+      if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+        thread_count = omp_get_max_threads();
+#endif
+      std::vector<value_type> temp_array(A_size1*thread_count, 0);
+      detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        wrapper_res(row) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      {
+        vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+        if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+          id = omp_get_thread_num();
+#endif
+        vcl_size_t begin = (A_size2 * id) / thread_count;
+        vcl_size_t end   = (A_size2 * (id + 1)) / thread_count;
+
+        detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_mat(data_A, A_start1, A_start2 + A_inc2 * begin, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+        for (vcl_size_t col = 0; col < (end - begin); ++col)  //run through matrix sequentially
+        {
+          value_type temp = wrapper_vec(col);
+          for (vcl_size_t row = 0; row < A_size1; ++row)
+            temp_array[A_size1 * id + row] += wrapper_mat(row , col) * temp;
+        }
+      }
+      for (vcl_size_t id = 0; id < thread_count; ++id)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_res(row) += temp_array[A_size1 * id + row];
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size2); ++row)
+      {
+        value_type temp = 0;
+        for (vcl_size_t col = 0; col < A_size1; ++col)
+          temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+        data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+      }
+    }
+  }
+}
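
The transposed row-major branch above avoids write conflicts on the result vector by giving each thread its own slice of temp_array and summing the slices afterwards. A condensed standalone sketch of that scheme for y = A^T x on plain row-major storage (no strides; it keeps the original's assumption that the parallel region runs with omp_get_max_threads() threads):

  #include <cstddef>
  #include <vector>
  #ifdef _OPENMP
  #include <omp.h>
  #endif

  void atx_sketch(const std::vector<double>& A, const std::vector<double>& x,
                  std::vector<double>& y, std::size_t rows, std::size_t cols)
  {
    std::size_t threads = 1;
  #ifdef _OPENMP
    threads = static_cast<std::size_t>(omp_get_max_threads());
  #endif
    std::vector<double> partial(cols * threads, 0.0);   // one accumulator slice per thread

  #ifdef _OPENMP
    #pragma omp parallel
  #endif
    {
      std::size_t id = 0;
  #ifdef _OPENMP
      id = static_cast<std::size_t>(omp_get_thread_num());
  #endif
      std::size_t begin = (rows * id) / threads;        // each thread owns a row range of A
      std::size_t end   = (rows * (id + 1)) / threads;
      for (std::size_t row = begin; row < end; ++row)
        for (std::size_t col = 0; col < cols; ++col)
          partial[cols * id + col] += A[row * cols + col] * x[row];
    }

    y.assign(cols, 0.0);                                // reduce the per-thread slices
    for (std::size_t id = 0; id < threads; ++id)
      for (std::size_t col = 0; col < cols; ++col)
        y[col] += partial[cols * id + col];
  }
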
+
+
+
+//
+/////////////////////////   matrix-matrix products /////////////////////////////////
+//
+
+namespace detail
+{
+  template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT>
+  void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C,
+            vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
+            NumericT alpha, NumericT beta)
+  {
+    if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0)
+      return;
+
+    static const vcl_size_t blocksize = 64;
+
+    vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1;
+    vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1;
+    vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1;
+
+    //
+    // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C:
+    //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((C_size1*C_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2)
+    {
+      // thread-local auxiliary buffers
+      std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major
+      std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major
+      std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major
+
+      vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2);
+      for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j)
+      {
+        // Reset block matrix:
+        std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0));
+
+        vcl_size_t offset_i = block_idx_i*blocksize;
+        vcl_size_t offset_j = block_idx_j*blocksize;
+
+        //  C(block_idx_i, block_idx_j) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j)

+        for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k)
+        {
+          // flush buffers:
+          std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0));
+          std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0));
+
+          vcl_size_t offset_k = block_idx_k*blocksize;
+
+          // load current data:
+          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+            for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+              buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k);
+
+          for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+            for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+              buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j);
+
+          // multiply (this is the hot spot in terms of flops)
+          for (vcl_size_t i = 0; i < blocksize; ++i)
+          {
+            NumericT const * ptrA = &(buffer_A[i*blocksize]);
+            for (vcl_size_t j = 0; j < blocksize; ++j)
+            {
+              NumericT const * ptrB = &(buffer_B[j*blocksize]);
+
+              NumericT temp = NumericT(0);
+              for (vcl_size_t k = 0; k < blocksize; ++k)
+                temp += ptrA[k] * ptrB[k];  // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize];
+
+              buffer_C[i*blocksize + j] += temp;
+            }
+          }
+        }
+
+        // write result:
+        if (beta > 0 || beta < 0)
+        {
+          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+            for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+              C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+        }
+        else
+        {
+          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+            for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+              C(i,j) =                 alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+        }
+
+      } // for block j
+    } // for block i
+
+  } // prod()
+
+} // namespace detail
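
detail::prod() is a classic cache-blocked GEMM: 64x64 blocks of A and B are copied into thread-local buffers (A row-major, B column-major, so the innermost loop is a contiguous dot product), multiplied, and the corresponding block of C is written back once with alpha and beta applied; the outer row-block loop is what gets parallelized with OpenMP. A standalone sketch of the same scheme on plain row-major arrays, single-threaded, without the beta == 0 special case (C is assumed initialized):

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  // C = alpha*A*B + beta*C, with A (M x K), B (K x N), C (M x N), all row-major.
  void gemm_blocked(const std::vector<double>& A, const std::vector<double>& B,
                    std::vector<double>& C, std::size_t M, std::size_t N, std::size_t K,
                    double alpha, double beta)
  {
    const std::size_t bs = 64;                           // block size, as in detail::prod()
    std::vector<double> bufA(bs * bs), bufB(bs * bs), bufC(bs * bs);

    for (std::size_t i0 = 0; i0 < M; i0 += bs)
      for (std::size_t j0 = 0; j0 < N; j0 += bs)
      {
        std::fill(bufC.begin(), bufC.end(), 0.0);
        for (std::size_t k0 = 0; k0 < K; k0 += bs)
        {
          std::fill(bufA.begin(), bufA.end(), 0.0);      // zero-pad partial blocks
          std::fill(bufB.begin(), bufB.end(), 0.0);
          for (std::size_t i = i0; i < std::min(i0 + bs, M); ++i)
            for (std::size_t k = k0; k < std::min(k0 + bs, K); ++k)
              bufA[(i - i0) * bs + (k - k0)] = A[i * K + k];        // row-major block of A
          for (std::size_t j = j0; j < std::min(j0 + bs, N); ++j)
            for (std::size_t k = k0; k < std::min(k0 + bs, K); ++k)
              bufB[(k - k0) + (j - j0) * bs] = B[k * N + j];        // column-major block of B

          for (std::size_t i = 0; i < bs; ++i)                       // block multiply (hot spot)
            for (std::size_t j = 0; j < bs; ++j)
            {
              double t = 0.0;
              for (std::size_t k = 0; k < bs; ++k)
                t += bufA[i * bs + k] * bufB[k + j * bs];
              bufC[i * bs + j] += t;
            }
        }
        for (std::size_t i = i0; i < std::min(i0 + bs, M); ++i)      // write back once per block
          for (std::size_t j = j0; j < std::min(j0 + bs, N); ++j)
            C[i * N + j] = beta * C[i * N + j] + alpha * bufC[(i - i0) * bs + (j - j0)];
      }
  }
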
+
+/** @brief Carries out matrix-matrix multiplication
+*
+* Implementation of C = prod(A, B);
+*
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2 >
+void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
+               const matrix_base<NumericT> & B, bool trans_B,
+                     matrix_base<NumericT> & C,
+               ScalarT1 alpha,
+               ScalarT2 beta)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
+  value_type       * data_C = detail::extract_raw_pointer<value_type>(C);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(A);
+  vcl_size_t A_start2 = viennacl::traits::start2(A);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+  vcl_size_t A_size1  = viennacl::traits::size1(A);
+  vcl_size_t A_size2  = viennacl::traits::size2(A);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(B);
+  vcl_size_t B_start2 = viennacl::traits::start2(B);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+  vcl_size_t C_start1 = viennacl::traits::start1(C);
+  vcl_size_t C_start2 = viennacl::traits::start2(C);
+  vcl_size_t C_inc1   = viennacl::traits::stride1(C);
+  vcl_size_t C_inc2   = viennacl::traits::stride2(C);
+  vcl_size_t C_size1  = viennacl::traits::size1(C);
+  vcl_size_t C_size2  = viennacl::traits::size2(C);
+  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(C);
+  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(C);
+
+  if (!trans_A && !trans_B)
+  {
+    if (A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+  }
+  else if (!trans_A && trans_B)
+  {
+    if (A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+  }
+  else if (trans_A && !trans_B)
+  {
+    if (A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+  }
+  else if (trans_A && trans_B)
+  {
+    if (A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,          row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (A.row_major() && !B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,          row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && B.row_major() && !C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const,    row_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else if (!A.row_major() && !B.row_major() && C.row_major())
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,          row_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+    else
+    {
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type const, column_major, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+      detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+      detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+    }
+  }
+}
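
prod_impl() enumerates all 32 combinations of transposition (4) and storage order of A, B, C (8) so that detail::prod() always sees a plain (i, j) accessor and each instantiation compiles to a fully static inner loop. A rough sketch of the accessor idea (illustrative only, not the actual matrix_array_wrapper):

  #include <cstddef>

  // Layout and transposition are resolved at compile time; the kernel is written
  // once against operator()(i, j).
  template<typename T, bool RowMajor, bool Transposed>
  struct accessor
  {
    T * p;
    std::size_t rows, cols;          // dimensions of the *stored* matrix
    T & operator()(std::size_t i, std::size_t j) const
    {
      if (Transposed) { std::size_t t = i; i = j; j = t; }   // read A^T through A's storage
      return RowMajor ? p[i * cols + j] : p[i + j * rows];
    }
  };
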
+
+
+
+
+//
+/////////////////////////   miscellaneous operations /////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+*
+* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+*
+* @param mat1    The matrix to be updated
+* @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+* @param reciprocal_alpha Use 1/alpha instead of alpha
+* @param flip_sign_alpha  Use -alpha instead of alpha
+* @param vec1    The first vector
+* @param vec2    The second vector
+*/
+template<typename NumericT, typename ScalarT>
+void scaled_rank_1_update(matrix_base<NumericT> & mat1,
+                          ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+                          const vector_base<NumericT> & vec1,
+                          const vector_base<NumericT> & vec2)
+{
+  typedef NumericT        value_type;
+
+  value_type       * data_A  = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  if (mat1.row_major())
+  {
+    if(reciprocal_alpha)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      {
+        value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] / data_alpha;
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+      }
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      {
+        value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] * data_alpha;
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+      }
+    }
+  }
+  else
+  {
+      if(reciprocal_alpha)
+      {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+        for (long col = 0; col < static_cast<long>(A_size2); ++col)  //run through matrix sequentially
+        {
+          value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] / data_alpha;
+          for (vcl_size_t row = 0; row < A_size1; ++row)
+            data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2;
+        }
+      }
+      else
+      {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+        for (long col = 0; col < static_cast<long>(A_size2); ++col)  //run through matrix sequentially
+        {
+          value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] * data_alpha;
+          for (vcl_size_t row = 0; row < A_size1; ++row)
+            data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_intern

<TRUNCATED>
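
The scaled_rank_1_update body is cut off by the archive at this point; the remaining column-major branch mirrors the row-major one shown above. For reference, a minimal standalone sketch of the row-major update A += alpha * v1 * v2^T on plain arrays (no strides, no reciprocal/flip-sign handling):

  #include <cstddef>
  #include <vector>

  void rank1_update(std::vector<double>& A, double alpha,
                    const std::vector<double>& v1, const std::vector<double>& v2,
                    std::size_t rows, std::size_t cols)
  {
    for (std::size_t i = 0; i < rows; ++i)
    {
      const double vi = alpha * v1[i];     // hoist the per-row factor, as the original loop does
      for (std::size_t j = 0; j < cols; ++j)
        A[i * cols + j] += vi * v2[j];
    }
  }
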

[43/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
new file mode 100644
index 0000000..19f7993
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
@@ -0,0 +1,512 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
+#define VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/mapped_objects.hpp
+    @brief Map ViennaCL objects to generator wrappers
+*/
+
+#include <string>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+
+namespace device_specific
+{
+
+/** @brief Mapped Object
+*
+* This object populates the symbolic mapping associated with a statement. (root_id, LHS|RHS|PARENT) => mapped_object
+* The tree can then be reconstructed in its symbolic form
+*/
+class mapped_object
+{
+private:
+  virtual void postprocess(std::string &) const { }
+
+protected:
+  struct MorphBase { virtual ~MorphBase(){} };
+  struct MorphBase1D : public MorphBase { public: virtual std::string operator()(std::string const & i) const = 0; };
+  struct MorphBase2D : public MorphBase { public: virtual std::string operator()(std::string const & i, std::string const & j) const = 0; };
+
+  static void replace_offset(std::string & str, MorphBase const & morph)
+  {
+    vcl_size_t pos = 0;
+    while ((pos=str.find("$OFFSET", pos))!=std::string::npos)
+    {
+      std::string postprocessed;
+      vcl_size_t pos_po = str.find('{', pos);
+      vcl_size_t pos_pe = str.find('}', pos_po);
+
+      if (MorphBase2D const * p2d = dynamic_cast<MorphBase2D const *>(&morph))
+      {
+        vcl_size_t pos_comma = str.find(',', pos_po);
+        std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1);
+        std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1);
+        postprocessed = (*p2d)(i, j);
+      }
+      else if (MorphBase1D const * p1d = dynamic_cast<MorphBase1D const *>(&morph))
+      {
+        std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1);
+        postprocessed = (*p1d)(i);
+      }
+
+      str.replace(pos, pos_pe + 1 - pos, postprocessed);
+      pos = pos_pe;
+    }
+  }
+
+  void register_attribute(std::string & attribute, std::string const & key, std::string const & value)
+  {
+    attribute = value;
+    keywords_[key] = attribute;
+  }
+
+public:
+  struct node_info
+  {
+    node_info(mapping_type const * _mapping, scheduler::statement const * _statement, vcl_size_t _root_idx) :
+      mapping(_mapping), statement(_statement), root_idx(_root_idx) { }
+    mapping_type const * mapping;
+    scheduler::statement const * statement;
+    vcl_size_t root_idx;
+  };
+
+public:
+  mapped_object(std::string const & scalartype, unsigned int id, std::string const & type_key) : type_key_(type_key)
+  {
+    register_attribute(scalartype_, "#scalartype", scalartype);
+    register_attribute(name_, "#name", "obj" + tools::to_string(id));
+  }
+
+  virtual ~mapped_object(){ }
+
+  virtual std::string & append_kernel_arguments(std::set<std::string> &, std::string & str, unsigned int) const { return str; }
+
+  std::string type_key() const { return type_key_; }
+
+  std::string const & name() const { return name_; }
+
+  std::string process(std::string const & in) const
+  {
+    std::string res(in);
+    for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it)
+      tools::find_and_replace(res, it->first, it->second);
+    postprocess(res);
+    return res;
+  }
+
+  std::string evaluate(std::map<std::string, std::string> const & accessors) const
+  {
+    if (accessors.find(type_key_)==accessors.end())
+      return name_;
+    return process(at(accessors, type_key_));
+  }
+
+
+protected:
+  std::string name_;
+  std::string scalartype_;
+  std::string type_key_;
+  std::map<std::string, std::string> keywords_;
+};
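
mapped_object::process() is a plain textual macro expansion: every registered "#keyword" is substituted into a code template before the kernel source is assembled. A small self-contained sketch of that substitution (find_and_replace here is a local stand-in, not the viennacl::tools helper):

  #include <map>
  #include <string>

  static void find_and_replace(std::string & s, std::string const & key, std::string const & val)
  {
    for (std::string::size_type pos = s.find(key); pos != std::string::npos; pos = s.find(key, pos + val.size()))
      s.replace(pos, key.size(), val);
  }

  std::string demo_process()
  {
    std::map<std::string, std::string> keywords;
    keywords["#name"]       = "obj0";
    keywords["#scalartype"] = "float";

    std::string tpl = "#scalartype #name = 0;";
    for (std::map<std::string, std::string>::const_iterator it = keywords.begin(); it != keywords.end(); ++it)
      find_and_replace(tpl, it->first, it->second);
    return tpl;   // "float obj0 = 0;"
  }
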
+
+
+/** @brief Binary leaf interface
+*
+*  Some subtrees have to be interpreted as leaves when reconstructing the final expression. This is the case for trans(), diag(), prod(), etc.
+*  This interface stores basic information about such sub-trees.
+*/
+class binary_leaf
+{
+public:
+  binary_leaf(mapped_object::node_info info) : info_(info){ }
+
+  void process_recursive(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & key, std::string const & process_str, std::set<std::string> & already_fetched)
+  {
+    tree_parsing::process(stream, leaf, key, process_str, *info_.statement, info_.root_idx, *info_.mapping, already_fetched);
+  }
+
+  std::string evaluate_recursive(leaf_t leaf, std::map<std::string, std::string> const & accessors)
+  {
+    return tree_parsing::evaluate(leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+protected:
+  mapped_object::node_info info_;
+};
+
+/** @brief Matrix product
+  *
+  * Maps prod(matrix_expression, matrix_expression)
+  */
+class mapped_matrix_product : public mapped_object, public binary_leaf
+{
+public:
+  mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { }
+};
+
+/** @brief Reduction
+*
+* Base class for mapping a reduction
+*/
+class mapped_reduction : public mapped_object, public binary_leaf
+{
+public:
+  mapped_reduction(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) : mapped_object(scalartype, id, type_key), binary_leaf(info){ }
+
+  vcl_size_t root_idx() const { return info_.root_idx; }
+  scheduler::statement const & statement() const { return *info_.statement; }
+  scheduler::statement_node root_node() const { return statement().array()[root_idx()]; }
+  bool is_index_reduction() const { return utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); }
+
+  scheduler::op_element root_op() const
+  {
+    scheduler::op_element res = info_.statement->array()[info_.root_idx].op;
+    if (res.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
+        ||res.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
+      res.type        = scheduler::OPERATION_BINARY_ADD_TYPE;
+    return res;
+  }
+};
+
+/** @brief Scalar reduction
+*
+* Maps a scalar reduction (max, min, argmax, inner_prod, etc.)
+*/
+class mapped_scalar_reduction : public mapped_reduction
+{
+public:
+  mapped_scalar_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ }
+};
+
+/** @brief Vector reduction
+*
+* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc.)
+*/
+class mapped_row_wise_reduction : public mapped_reduction
+{
+public:
+  mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") { }
+};
+
+/** @brief Host scalar
+ *
+ * Maps a host scalar (passed by value)
+ */
+class mapped_host_scalar : public mapped_object
+{
+public:
+  mapped_host_scalar(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "host_scalar"){ }
+
+  std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const
+  {
+    if (already_generated.insert(name_).second)
+      str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+    return str;
+  }
+};
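
append_kernel_arguments() relies on std::set::insert() to emit each object's argument exactly once, even if the object appears several times in the expression. A short sketch of that de-duplication (the argument-formatting helper is paraphrased, not the real generate_value_kernel_argument):

  #include <set>
  #include <string>

  std::string & append_scalar_argument(std::set<std::string> & already_generated,
                                       std::string & args,
                                       std::string const & scalartype,
                                       std::string const & name)
  {
    if (already_generated.insert(name).second)   // true only the first time this name is seen
      args += scalartype + " " + name + ",\n";
    return args;
  }
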
+
+/** @brief Handle
+*
+* Maps an object passed by pointer
+*/
+class mapped_handle : public mapped_object
+{
+private:
+  virtual void append_optional_arguments(std::string &) const = 0;
+
+public:
+  mapped_handle(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_object(scalartype, id, type_key)
+  {
+    register_attribute(pointer_, "#pointer", name_ + "_pointer");
+  }
+
+  std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const
+  {
+    if (already_generated.insert(name_).second)
+    {
+      str += generate_pointer_kernel_argument("__global", utils::append_width(scalartype_, width), pointer_);
+      append_optional_arguments(str);
+    }
+    return str;
+  }
+
+private:
+  std::string pointer_;
+};
+
+
+/** @brief Scalar
+ *
+ * Maps a scalar passed by pointer
+ */
+class mapped_scalar : public mapped_handle
+{
+private:
+  void append_optional_arguments(std::string &) const{ }
+
+public:
+  mapped_scalar(std::string const & scalartype, unsigned int id) : mapped_handle(scalartype, id, "scalar") { }
+};
+
+/** @brief Buffered
+ *
+ * Maps a buffered object (vector, matrix)
+ */
+class mapped_buffer : public mapped_handle
+{
+public:
+  mapped_buffer(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_handle(scalartype, id, type_key){ }
+};
+
+/** @brief Vector
+ *
+ * Maps a vector
+ */
+class mapped_vector : public mapped_buffer
+{
+  void append_optional_arguments(std::string & str) const
+  {
+    str += generate_value_kernel_argument("unsigned int", start_);
+    str += generate_value_kernel_argument("unsigned int", stride_);
+  }
+
+public:
+  mapped_vector(std::string const & scalartype, unsigned int id) : mapped_buffer(scalartype, id, "vector")
+  {
+    register_attribute(start_, "#start", name_ + "_start");
+    register_attribute(stride_, "#stride", name_ + "_stride");
+  }
+
+private:
+  std::string start_;
+  std::string stride_;
+};
+
+/** @brief Matrix
+ *
+ * Maps a matrix
+ */
+class mapped_matrix : public mapped_buffer
+{
+private:
+  void append_optional_arguments(std::string & str) const
+  {
+    str += generate_value_kernel_argument("unsigned int", ld_);
+    str += generate_value_kernel_argument("unsigned int", start1_);
+    str += generate_value_kernel_argument("unsigned int", start2_);
+    str += generate_value_kernel_argument("unsigned int", stride1_);
+    str += generate_value_kernel_argument("unsigned int", stride2_);
+  }
+
+  void postprocess(std::string & str) const
+  {
+    struct Morph : public MorphBase2D
+    {
+      Morph(bool _is_row_major, std::string const & _ld) : is_row_major(_is_row_major), ld(_ld){ }
+      std::string operator()(std::string const & i, std::string const & j) const
+      {
+        if (is_row_major)
+          return "(" + i + ") * " + ld +  " + (" + j + ")";
+        return "(" + i + ") +  (" + j + ") * " + ld;
+      }
+    private:
+      bool is_row_major;
+      std::string const & ld;
+    };
+    replace_offset(str, Morph(row_major_, ld_));
+  }
+
+public:
+  mapped_matrix(std::string const & scalartype, unsigned int id, bool row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major)
+  {
+    register_attribute(ld_, "#ld", name_ + "_ld");
+    register_attribute(start1_, "#start1", name_ + "_start1");
+    register_attribute(start2_, "#start2", name_ + "_start2");
+    register_attribute(stride1_, "#stride1", name_ + "_stride1");
+    register_attribute(stride2_, "#stride2", name_ + "_stride2");
+    if (row_major_)
+      keywords_["#nldstride"] = "#stride1";
+    else
+      keywords_["#nldstride"] = "#stride2";
+
+    if (row_major_)
+    {
+      std::swap(start1_, start2_);
+      std::swap(stride1_, stride2_);
+    }
+  }
+
+  bool row_major() const
+  {
+    return row_major_;
+  }
+
+private:
+  std::string ld_;
+  std::string start1_;
+  std::string start2_;
+  std::string stride1_;
+  std::string stride2_;
+  bool row_major_;
+};
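+
+// Illustration of the offset strings produced by the Morph functor in
+// postprocess() above: with i = "r", j = "c" and a leading dimension string
+// "A_ld", an $OFFSET{r, c} placeholder becomes
+//
+//   (r) * A_ld + (c)      for row-major storage
+//   (r) + (c) * A_ld      for column-major storage
+//
+// so the same generated kernel body addresses both layouts correctly.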
+
+/** @brief Vector diag
+*
+*  Maps a diag(vector_expression) node into a diagonal matrix
+*/
+class mapped_vector_diag : public mapped_object, public binary_leaf
+{
+private:
+  void postprocess(std::string &res) const
+  {
+    std::map<std::string, std::string> accessors;
+    tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+    accessors["vector"] = res;
+    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+public:
+  mapped_vector_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "vector_diag"), binary_leaf(info){ }
+};
+
+
+/** @brief Trans
+*
+*  Maps trans(matrix_expression) to the transpose of matrix_expression
+*/
+class mapped_trans: public mapped_object, public binary_leaf
+{
+private:
+  void postprocess(std::string &res) const
+  {
+    std::map<std::string, std::string> accessors;
+    accessors["matrix"] = res;
+    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+public:
+  mapped_trans(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ }
+};
+
+/** @brief Matrix row
+*
+*  Maps row(matrix_expression, scalar_expression) to the row of matrix_expression selected by scalar_expression
+*/
+class mapped_matrix_row : public mapped_object, binary_leaf
+{
+private:
+  void postprocess(std::string &res) const
+  {
+    std::map<std::string, std::string> accessors;
+    tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+    accessors["matrix"] = res;
+    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+public:
+  mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info)
+  { }
+};
+
+
+/** @brief Matrix column
+*
+*  Maps column(matrix_expression, scalar_expression) to the column of matrix_expression selected by scalar_expression
+*/
+class mapped_matrix_column : public mapped_object, binary_leaf
+{
+private:
+  void postprocess(std::string &res) const
+  {
+    std::map<std::string, std::string> accessors;
+    tools::find_and_replace(res, "#column", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+    accessors["matrix"] = res;
+    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+public:
+  mapped_matrix_column(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_column"), binary_leaf(info)
+  { }
+};
+
+/** @brief Matrix diag
+*
+*  Maps a diag(matrix_expression) node into the vector of its diagonal elements
+*/
+class mapped_matrix_diag : public mapped_object, binary_leaf
+{
+private:
+  void postprocess(std::string &res) const
+  {
+    std::map<std::string, std::string> accessors;
+    tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+    accessors["matrix"] = res;
+    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+  }
+
+public:
+  mapped_matrix_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_diag"), binary_leaf(info)
+  { }
+};
+
+/** @brief Implicit vector
+ *
+ * Maps an implicit vector
+ */
+class mapped_implicit_vector : public mapped_object
+{
+public:
+  mapped_implicit_vector(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_vector")
+  { }
+
+  std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const
+  {
+    str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+    return str;
+  }
+};
+
+/** @brief Implicit matrix
+ *
+ * Maps an implicit matrix
+ */
+class mapped_implicit_matrix : public mapped_object
+{
+public:
+  mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_matrix")
+  { }
+
+  std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const
+  {
+    str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+    return str;
+  }
+};
+
+}
+
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
new file mode 100644
index 0000000..1f082ac
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
@@ -0,0 +1,859 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
+
+/* =========================================================================
+Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                ViennaCL - The Vienna Computing Library
+                            -----------------
+
+Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+(A list of authors and contributors can be found in the manual)
+
+License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/matrix_product_template.hpp
+*
+* Kernel template for the matrix product operation
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/detail/matrix_def.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+#include "viennacl/device_specific/utils.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/forwards.h"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+struct matrix_product_parameters : public template_base::parameters_type
+{
+  matrix_product_parameters(unsigned int simd_width
+                            , unsigned int local_size_0, unsigned int KL, unsigned int local_size_1
+                            , unsigned int ms, unsigned int ks, unsigned int ns
+                            , fetching_policy_type A_fetching_policy_param, fetching_policy_type B_fetching_policy_param
+                            , unsigned int local_fetch_0_param, unsigned int local_fetch_1_param): template_base::parameters_type(simd_width, local_size_0, local_size_1, 1),
+    kL(KL), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy_param), B_fetching_policy(B_fetching_policy_param),
+    local_fetch_0(local_fetch_0_param), local_fetch_1(local_fetch_1_param),
+    mL(ms*local_size_0), nL(ns*local_size_1){}
+
+  unsigned int kL;
+
+  unsigned int mS;
+  unsigned int kS;
+  unsigned int nS;
+
+  fetching_policy_type A_fetching_policy;
+  fetching_policy_type B_fetching_policy;
+
+  unsigned int local_fetch_0;
+  unsigned int local_fetch_1;
+
+  unsigned int mL;
+  unsigned int nL;
+};
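+
+// A construction sketch (the values are illustrative, not tuned for any
+// particular device): a SIMD width of 4, 16x16 work-groups, a K-tile of 32
+// and 8x4x8 per-work-item micro-tiles, with both operands staged in local
+// memory:
+//
+//   matrix_product_parameters params(/*simd_width*/ 4,
+//                                    /*local_size_0*/ 16, /*KL*/ 32, /*local_size_1*/ 16,
+//                                    /*ms*/ 8, /*ks*/ 4, /*ns*/ 8,
+//                                    FETCH_FROM_LOCAL, FETCH_FROM_LOCAL,
+//                                    /*local_fetch_0*/ 32, /*local_fetch_1*/ 8);
+//
+// The derived block sizes are then params.mL == 8*16 == 128 and
+// params.nL == 8*16 == 128.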
+
+class matrix_product_template : public template_base_impl<matrix_product_template, matrix_product_parameters>
+{
+
+private:
+  unsigned int n_lmem_elements() const
+  {
+    unsigned int N = 0;
+    if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
+      N += p_.kL * (p_.mL+1);
+    if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
+      N += p_.nL * (p_.kL+1);
+    return N;
+  }
+
+  int check_invalid_impl(viennacl::ocl::device const & /*device*/) const
+  {
+    if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL && (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
+      return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
+
+    if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
+      return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
+
+    if (p_.kS > p_.kL)
+      return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
+
+    if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1)
+      return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
+
+    if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1))
+        return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
+    }
+
+    if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
+      unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
+
+      if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
+        return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
+
+      if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
+        return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
+
+    }
+    if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
+      unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
+
+      if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
+        return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
+
+      if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
+        return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
+
+    }
+
+    return TEMPLATE_VALID;
+  }
+
+  static void parse(scheduler::statement const & s,
+                    vcl_size_t & C_idx, leaf_t & C_leaf, vcl_size_t & alpha_idx, leaf_t & alpha_leaf,
+                    vcl_size_t & A_idx, leaf_t & A_leaf, bool& A_trans, vcl_size_t & B_idx, leaf_t & B_leaf, bool& B_trans,
+                    vcl_size_t & beta_idx, leaf_t & beta_leaf)
+  {
+    using namespace tree_parsing;
+    using namespace scheduler;
+
+    scheduler::statement::container_type const & array = s.array();
+    vcl_size_t root_idx = s.root();
+
+    C_idx = root_idx;
+    C_leaf = LHS_NODE_TYPE;
+
+    vcl_size_t node_add_idx = array[root_idx].rhs.node_index;
+
+    vcl_size_t node_1_idx = array[node_add_idx].lhs.node_index;
+    alpha_idx = node_1_idx;
+    alpha_leaf = RHS_NODE_TYPE;
+
+    vcl_size_t mat_prod_idx = array[node_1_idx].lhs.node_index;
+    if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY)
+    {
+      A_trans = false;
+      A_idx = mat_prod_idx;
+    }
+    else
+    {
+      A_trans = true;
+      A_idx = array[mat_prod_idx].lhs.node_index;
+    }
+    A_leaf = LHS_NODE_TYPE;
+
+    if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY)
+    {
+      B_trans = false;
+      B_idx = mat_prod_idx;
+      B_leaf = RHS_NODE_TYPE;
+    }
+    else
+    {
+      B_trans = true;
+      B_idx = array[mat_prod_idx].rhs.node_index;
+      B_leaf = LHS_NODE_TYPE;
+    }
+
+    vcl_size_t node_2_idx = array[node_add_idx].rhs.node_index;
+    beta_idx = node_2_idx;
+    beta_leaf = RHS_NODE_TYPE;
+  }
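+
+  // Rough sketch of the expression tree parse() above expects (an assignment
+  // at the root, as built for C = prod(A, B) * alpha + C * beta; A and B may
+  // each be wrapped in a trans(...) node, in which case one extra level is
+  // unwrapped and the corresponding trans flag is set):
+  //
+  //   root     :  C  =  <add>
+  //   <add>    :  <node_1>  +  <node_2>
+  //   <node_1> :  prod(A, B)  *  alpha
+  //   <node_2> :  C  *  beta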
+
+  void VIENNACL_HANDLE_BOUNDS(bool fallback, utils::kernel_generation_stream & stream, std::string const & inbounds, std::string const & do_if, std::string do_else) const
+  {
+    if (fallback)
+    {
+      stream << "if (" << inbounds << ")" << std::endl;
+      stream.inc_tab();
+      stream << do_if << ";" << std::endl;
+      stream.dec_tab();
+      stream << "else" << std::endl;
+      stream.inc_tab();
+      stream << do_else << ";" << std::endl;
+      stream.dec_tab();
+    }
+    else
+      stream << do_if << ";" << std::endl;
+  }
+
+
+  std::string generate_impl(const std::string &kernel_prefix, const statements_container &statements, const std::vector<mapping_type> &mappings, bool fallback) const
+  {
+    using std::string;
+    using tools::to_string;
+
+    parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, p_.local_fetch_1);
+    parameters_type const & p = fallback?pfallback:p_;
+
+#define VIENNACL_MUL_STRIDE1 string(fallback?"*#stride1":"")
+#define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0"))
+#define VIENNACL_VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr)
+
+    string widthstr = tools::to_string(p.simd_width);
+
+    //////////////////
+    /// INIT
+    /// //////////////
+    utils::kernel_generation_stream stream;
+    scheduler::statement const & st = statements.data().front();
+    mapping_type const & mapping = mappings.front();
+
+    bool A_trans = false, B_trans = false;
+    vcl_size_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
+    leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
+    parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
+
+    mapped_matrix      * C     = (mapped_matrix*     )at(mapping, mapping_key(    C_idx,     C_leaf)).get();
+    mapped_host_scalar * alpha = (mapped_host_scalar*)at(mapping, mapping_key(alpha_idx, alpha_leaf)).get();
+    mapped_matrix      * A     = (mapped_matrix*     )at(mapping, mapping_key(    A_idx,     A_leaf)).get();
+    mapped_matrix      * B     = (mapped_matrix*     )at(mapping, mapping_key(    B_idx,     B_leaf)).get();
+    mapped_host_scalar * beta  = (mapped_host_scalar*)at(mapping, mapping_key( beta_idx,  beta_leaf)).get();
+
+    //////////////////
+    /// DECLARATIONS
+    /// //////////////
+
+    stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl;
+    std::map<std::string, unsigned int> widths;
+    widths[A->name()] = p.simd_width;
+    widths[B->name()] = p.simd_width;
+    generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths);
+    stream << "{" << std::endl;
+    stream.inc_tab();
+    if(!fallback)
+    {
+      stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << std::endl;
+      stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
+      stream << B->process("#start1/= "  + to_string(p.simd_width) + ";") << std::endl;
+      stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
+    }
+    tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#pointer += $OFFSET{#start1, #start2};", statements, mappings);
+    tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#ld *= #nldstride;", statements, mappings);
+
+    ///Result Values
+    stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl;
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+      stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + to_string(p.mS) + "];") << std::endl;
+    else
+      stream << A->process(utils::append_width("#scalartype",p.simd_width) + " rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << std::endl;
+    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+      stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + to_string(p.nS) + "];");
+    else
+      stream << B->process(utils::append_width("#scalartype",p.simd_width) + " rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << std::endl;
+
+
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+      stream << A->process("__local #scalartype lA[" + to_string(p.kL*(p.mL+1)) + "];");
+    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+      stream << B->process("__local #scalartype lB[" + to_string(p.kL*(p.nL+1)) + "];");
+    stream << std::endl;
+
+    stream << "size_t gidx = get_group_id(0);" << std::endl;
+    stream << "size_t gidy = get_group_id(1);" << std::endl;
+    stream << "size_t idx = get_local_id(0);" << std::endl;
+    stream << "size_t idy = get_local_id(1);" << std::endl;
+
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      stream << std::endl;
+      stream << "size_t idt = " << p.local_size_0 << "*idy + idx;" << std::endl;
+      stream << "size_t idxT = idt % " << p.local_fetch_0 << ";" << std::endl;
+      stream << "size_t idyT = idt / " << p.local_fetch_0 << ";" << std::endl;
+    }
+    stream << std::endl;
+
+    if (fallback)
+    {
+      //Bounds checking for M (in A, C)
+      stream << "bool in_bounds_m[" << p.mS << "];" << std::endl;
+      stream << "for(size_t m = 0; m < " << p.mS << "; m++)" << std::endl;
+      stream.inc_tab();
+      switch (p.A_fetching_policy)
+      {
+      case FETCH_FROM_GLOBAL_CONTIGUOUS:
+        stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + m < M;" << std::endl;
+        break;
+      default:
+        stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << p.local_size_0 << " < M;" << std::endl;
+        break;
+      }
+      stream.dec_tab();
+
+      //Bounds checking for A if Local
+      if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+      {
+        unsigned int fetch_size = (A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
+        stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << std::endl;
+        stream << "for(size_t m = 0; m < " << p.mL/fetch_size << "; m++)" << std::endl;
+        stream.inc_tab();
+        stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << (A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl;
+        stream.dec_tab();
+      }
+
+      //Bounds checking for N (in B, C)
+      stream << "bool in_bounds_n[" << p.nS << "];" << std::endl;
+      stream << "for(size_t n = 0; n < " << p.nS << "; n++)" << std::endl;
+      stream.inc_tab();
+      switch (p.B_fetching_policy)
+      {
+      case FETCH_FROM_GLOBAL_CONTIGUOUS:
+        stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + n < N;" << std::endl;
+        break;
+      default:
+        stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << p.local_size_1 << " < N;" << std::endl;
+        break;
+      }
+      stream.dec_tab();
+
+      //Bounds checking for B if Local
+      if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+      {
+        unsigned int fetch_size = (B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
+        stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << std::endl;
+        stream << "for(size_t n = 0; n < " <<  p.nL/fetch_size << "; n++)" << std::endl;
+        stream.inc_tab();
+        stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << (B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl;
+        stream.dec_tab();
+      }
+    }
+
+    switch (p.A_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      if (A_trans_=='N')
+        stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + " + idxT)" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
+      else
+        stream << A->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_CONTIGUOUS:
+      if (A_trans_=='N')
+        stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      else
+        stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")*#ld;") << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_STRIDED:
+      if (A_trans_=='N')
+        stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      else
+        stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl;
+      break;
+
+    //default: break;
+    }
+
+    switch (p.B_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      if (B_trans_=='T')
+        stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + " + idxT" + ")" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
+      else
+        stream << B->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_CONTIGUOUS:
+      if (B_trans_=='T')
+        stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      else
+        stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")*#ld;") << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_STRIDED:
+      if (B_trans_=='T')
+        stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      else
+        stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl;
+      break;
+
+    //default: break;
+    }
+
+    stream << std::endl;
+    stream << "size_t K_size_t = K;" << std::endl;
+    stream << "for(size_t block_k=0; block_k < K_size_t; block_k+=" << p.kL << "){" << std::endl;
+    stream.inc_tab();
+
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      if (A_trans_=='N')
+        stream << A->process("__local #scalartype* plA = lA + idyT*" + to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
+      else
+        stream << A->process("__local #scalartype* plA = lA + idxT*" + to_string(p.mL + 1) + " + idyT;") << std::endl;
+    }
+
+
+    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      if (B_trans_=='T')
+        stream  << B->process("__local #scalartype* plB = lB + idyT*" + to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
+      else
+        stream << B->process("__local #scalartype* plB = lB + idxT*" + to_string(p.nL+1) + "+ idyT;") <<std::endl;
+    }
+
+
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
+      stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+
+    ///Fetch LHS to Local Memory
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N')
+      for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
+        for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width)
+        {
+          string in_bounds = "in_bounds_m_local[" + to_string(m/(p.local_fetch_0*p.simd_width)) + "]";
+          string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+          stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl;
+        }
+    else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T')
+      for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1)
+        for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width)
+        {
+          string in_bounds = "in_bounds_m_local[" + to_string(k/p.local_fetch_1) + "]";
+          string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+          stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl;
+        }
+
+    if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T')
+      for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
+        for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width)
+        {
+          string in_bounds = "in_bounds_n_local[" + to_string(n/(p.local_fetch_0*p.simd_width)) + "]";
+          string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+          stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl;
+        }
+    else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N')
+      for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1)
+        for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width)
+        {
+          string in_bounds = "in_bounds_n_local[" + to_string(k/p.local_fetch_1) + "]";
+          string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+          stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl;
+        }
+
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == FETCH_FROM_LOCAL)
+    {
+      stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+      stream << "size_t offA = " << p.simd_width << "*idx;" << std::endl;
+      stream << "size_t offB = " << p.simd_width << "*idy;" << std::endl;
+    }
+
+    if (fallback)
+      stream << "for(size_t k = 0; k < " << p.kL << " && (block_k + k < K_size_t); k+=" << p.kS << "){" << std::endl;
+    else
+      stream << "for(size_t k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << std::endl;
+    stream.inc_tab();
+
+    ///Fetch LHS to registers
+    stream << "#pragma unroll " << p.kS <<  std::endl;
+    stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
+    stream << "#pragma unroll " << p.mS/p.simd_width << std::endl;
+    stream << "for(size_t mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << std::endl;
+    stream << "{" << std::endl;
+    stream.inc_tab();
+    switch (p.A_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      for (unsigned int ss = 0; ss < p.simd_width; ++ss)
+        stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_CONTIGUOUS:
+    {
+      if (A_trans_=='N')
+        stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      else
+        stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      break;
+    }
+
+    case FETCH_FROM_GLOBAL_STRIDED:
+    {
+      if (A_trans_=='N')
+        stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + to_string(p.local_size_0) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      else
+        stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + to_string(p.local_size_0) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      break;
+    }
+
+    //default: break;
+    }
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+    stream << "#pragma unroll " << p.kS << std::endl;
+    stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
+    stream << "#pragma unroll " << p.nS/p.simd_width << std::endl;
+    stream << "for(size_t nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << std::endl;
+    stream << "{" << std::endl;
+    stream.inc_tab();
+    switch (p.B_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      for (unsigned int ss = 0; ss < p.simd_width; ++ss)
+        stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + nn*" << p.local_size_1*p.simd_width << "+" << ss  << "+ kk*" << (p.nL+1) << "];" << std::endl;
+      break;
+
+    case FETCH_FROM_GLOBAL_CONTIGUOUS:
+    {
+      if (B_trans_=='T')
+        stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      else
+        stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      break;
+    }
+
+    case FETCH_FROM_GLOBAL_STRIDED:
+    {
+      if (B_trans_=='T')
+        stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + to_string(p.local_size_1) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      else
+        stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + to_string(p.local_size_1) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+      break;
+    }
+
+    //default: break;
+    }
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+
+    ///Increment pointers
+    switch (p.A_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl;
+      break;
+
+    default:
+      if (A_trans_=='N')
+        stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
+      else
+        stream << A->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      break;
+    }
+
+
+    switch (p.B_fetching_policy)
+    {
+    case FETCH_FROM_LOCAL:
+      stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl;
+      break;
+
+    default:
+      if (B_trans_=='T')
+        stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
+      else
+        stream << B->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+      break;
+    }
+
+
+    stream << "#pragma unroll " << p.kS << std::endl;
+    stream << "for(size_t kk = 0; kk <" << p.kS << "; ++kk)" << std::endl;
+    stream << "{" << std::endl;
+    stream.inc_tab();
+    for (unsigned int nn=0; nn < p.nS; ++nn)
+      for (unsigned int mm=0; mm < p.mS; ++mm)
+      {
+        string res_str, lhs_str, rhs_str;
+        res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + "]";
+        if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
+          lhs_str = "rA[kk][" + tools::to_string(mm) + "]";
+        else
+          lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + tools::to_string(mm%p.simd_width);
+        if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
+          rhs_str = "rB[kk]["+tools::to_string(nn)+"]";
+        else
+          rhs_str = "rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width);
+        stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl;
+      }
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+
+
+
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+    //Increment global pointer if local memory is used
+    //Else, it's incremented directly when fetching
+    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      if (A_trans_=='N')
+        stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
+      else
+        stream << A->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+    }
+
+    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+    {
+      if (B_trans_=='T')
+        stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
+      else
+        stream << B->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+    }
+
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+
+    if (C->row_major())
+    {
+      unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
+      unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
+
+      stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << std::endl;
+      stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#ld;") << std::endl;
+      stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#stride2;") << std::endl;
+      stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#stride2;") << std::endl;
+
+      for (unsigned int n=0; n < p.nS; ++n)
+      {
+        for (unsigned int m=0; m < p.mS; ++m)
+        {
+          unsigned int ministride1 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0;
+          string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + m%p.simd_width);
+          if (fallback)
+          {
+            stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
+            stream.inc_tab();
+          }
+          stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
+          if (fallback)
+            stream.dec_tab();
+        }
+        if ((n+1)%p.simd_width>0 || p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+          stream << C->process("#pointer += #stride2;") << std::endl;
+        else
+          stream << C->process("#pointer += " + to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << std::endl;
+      }
+
+    }
+    else
+    {
+      unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
+      unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
+
+      stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#stride1;") << std::endl;
+      stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#stride1;") << std::endl;
+      stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << std::endl;
+      stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#ld;") << std::endl;
+
+      for (unsigned int m=0; m < p.mS; ++m)
+      {
+        for (unsigned int n=0; n < p.nS; ++n)
+        {
+          unsigned int ministride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1;
+          string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + n%p.simd_width);
+          if (fallback)
+          {
+            stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
+            stream.inc_tab();
+          }
+          stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
+          if (fallback)
+            stream.dec_tab();
+        }
+
+        if ((m+1)%p.simd_width>0 || p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+          stream << C->process("#pointer += #stride1;") << std::endl;
+        else
+          stream << C->process("#pointer += " + to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << std::endl;
+      }
+    }
+
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+    return stream.str();
+
+#undef VIENNACL_MUL_STRIDE1
+#undef VIENNACL_HANDLE_BOUNDS
+#undef VIENNACL_VSTORE
+  }
+
+  std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
+  {
+    std::vector<std::string> res;
+    res.push_back(generate_impl(kernel_prefix, statements, mappings, false));
+    res.push_back(generate_impl(kernel_prefix, statements, mappings, true));
+    return res;
+  }
+
+  template<class NumericT>
+  void enqueue_block(scheduler::statement & statement,
+                     scheduler::lhs_rhs_element& eA, scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, scheduler::lhs_rhs_element& ebeta,
+                     matrix_base<NumericT> const & A, matrix_base<NumericT> const & B, matrix_base<NumericT> const & C, NumericT beta,
+                     std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix, vcl_size_t id)
+  {
+    if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || C.size1()==0 || C.size2()==0)
+      return;
+
+    viennacl::ocl::kernel& kernel = programs[id].program().get_kernel(kernel_prefix);
+
+    kernel.local_work_size(0, p_.local_size_0);
+    kernel.local_work_size(1, p_.local_size_1);
+
+    scheduler::statement::assign_element(eA, A);
+    scheduler::statement::assign_element(eB, B);
+    scheduler::statement::assign_element(eC, C);
+    scheduler::statement::assign_element(ebeta, beta);
+
+    if (id==1)
+    {
+      kernel.global_work_size(0, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size1(),p_.mS)/p_.mS, p_.local_size_0));
+      kernel.global_work_size(1, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size2(),p_.nS)/p_.nS, p_.local_size_1));
+    }
+    else
+    {
+      kernel.global_work_size(0, C.size1()/p_.mS);
+      kernel.global_work_size(1, C.size2()/p_.nS);
+    }
+    unsigned int current_arg = 0;
+    kernel.arg(current_arg++, cl_uint(C.size1()));
+    kernel.arg(current_arg++, cl_uint(C.size2()));
+    if (A.row_major())
+      kernel.arg(current_arg++, cl_uint(A_trans_=='T'?A.size2():A.size1()));
+    else
+      kernel.arg(current_arg++, cl_uint(A_trans_=='N'?A.size2():A.size1()));
+    set_arguments(statement, kernel, current_arg);
+    viennacl::ocl::enqueue(kernel);
+
+  }
+
+  template<class NumericT>
+  matrix_slice< viennacl::matrix_base<NumericT> >  create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, scheduler::lhs_rhs_element const & element,
+                                                                          vcl_size_t s0_0, vcl_size_t s0_1, vcl_size_t s1_0, vcl_size_t s1_1, bool swap)
+  {
+    matrix_base<NumericT> & M = *(element.*ptr);
+    slice s0(s0_0, 1, s0_1 - s0_0);
+    slice s1(s1_0, 1, s1_1 - s1_0);
+    if (swap)
+      std::swap(s0, s1);
+    return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1);
+  }
+
+  template<class NumericT>
+  void enqueue_impl(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr_matrix,
+                    scheduler::statement & statement, scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta,
+                    NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix)
+  {
+    using namespace device_specific::utils;
+    vcl_size_t ldstrideA = call_on_matrix(A, leading_stride());
+    vcl_size_t ldstrideB = call_on_matrix(B, leading_stride());
+    vcl_size_t ldstrideC = call_on_matrix(C, leading_stride());
+    vcl_size_t ldstartA = call_on_matrix(A, leading_start());
+    vcl_size_t ldstartB = call_on_matrix(B, leading_start());
+    bool swap_A = ((A_trans_=='T') ^ utils::call_on_matrix(A, row_major_fun()));
+    bool swap_B = ((B_trans_=='T') ^ utils::call_on_matrix(B, row_major_fun()));
+
+    vcl_size_t M = call_on_matrix(C, size1_fun());
+    vcl_size_t N = call_on_matrix(C, size2_fun());
+    vcl_size_t K;
+    if (utils::call_on_matrix(A, row_major_fun()))
+      K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
+    else
+      K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
+
+    if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 ||
+        (p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % p_.simd_width > 0)))
+    {
+      enqueue_block(statement, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A),
+                    create_slice(ptr_matrix, B, 0, K, 0, N,  swap_B),
+                    create_slice(ptr_matrix, C, 0, M, 0, N, false), beta_value, programs, kernel_prefix, 1);
+      return;
+    }
+
+
+    scheduler::lhs_rhs_element Acopy = A;
+    scheduler::lhs_rhs_element Bcopy = B;
+    scheduler::lhs_rhs_element Ccopy = C;
+
+    vcl_size_t lM = M / p_.mL * p_.mL;
+    vcl_size_t lN = N / p_.nL * p_.nL;
+    vcl_size_t lK = K / p_.kL * p_.kL;
+
+
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), beta_value, programs, kernel_prefix, 0);
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
+
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), beta_value, programs, kernel_prefix, 1);
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
+
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), beta_value, programs, kernel_prefix, 1);
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
+
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), beta_value, programs, kernel_prefix, 1);
+    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
+  }
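+
+  // Blocking sketch for enqueue_impl() above (sizes are illustrative): with
+  // mL = nL = 128, kL = 32 and M = 300, N = 200, K = 100, the aligned sizes
+  // are lM = 256, lN = 128 and lK = 96. C is split into four blocks at
+  // (lM, lN), and each block is updated in two passes over K: the first pass
+  // over [0, lK) applies beta, the second over [lK, K) accumulates with a
+  // beta of 1. Only the first pass over the [0,lM) x [0,lN) block uses the
+  // unchecked kernel (id 0); every other launch uses the bounds-checked
+  // fallback (id 1).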
+
+public:
+  matrix_product_template(matrix_product_template::parameters_type const & parameters, char A_trans, char B_trans) : template_base_impl<matrix_product_template, matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), B_trans_(B_trans){ }
+
+  virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
+  {
+    using namespace device_specific::utils;
+    using namespace tree_parsing;
+
+    scheduler::statement const & st = statements.data().front();
+    bool A_trans, B_trans;
+    vcl_size_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
+    leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
+    parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
+
+    scheduler::statement stcopy = st;
+    scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, A_leaf);
+    scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, B_leaf);
+    scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, C_leaf);
+    scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, beta_idx, beta_leaf);
+
+    if (C.numeric_type==scheduler::FLOAT_TYPE)
+      enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, A, B, C, beta, beta.host_float, programs, kernel_prefix);
+    else if (C.numeric_type==scheduler::DOUBLE_TYPE)
+      enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, A, B, C, beta, beta.host_double, programs, kernel_prefix);
+    else
+      throw generator_not_supported_exception("GEMM only supported for float/double");
+
+  }
+
+private:
+  const char A_trans_;
+  const char B_trans_;
+};
+
+}
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
new file mode 100644
index 0000000..40e3168
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
@@ -0,0 +1,596 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/template_base.hpp
+ *
+ * Base classes for the profiles
+*/
+
+#include <list>
+#include <set>
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/device_utils.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+
+#include "viennacl/device_specific/lazy_program_compiler.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+enum fetching_policy_type
+{
+  FETCH_FROM_LOCAL,
+  FETCH_FROM_GLOBAL_STRIDED,
+  FETCH_FROM_GLOBAL_CONTIGUOUS
+};
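+
+// Informal reading of the three policies, as they are used by the templates in
+// this module (see matrix_product_template.hpp):
+//  - FETCH_FROM_LOCAL:             operands are first staged in __local memory
+//                                  tiles and consumed from there;
+//  - FETCH_FROM_GLOBAL_STRIDED:    work-items load directly from __global
+//                                  memory, strided by the work-group size;
+//  - FETCH_FROM_GLOBAL_CONTIGUOUS: each work-item loads a contiguous run of
+//                                  elements directly from __global memory.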
+
+class template_base
+{
+public:
+  struct parameters_type
+  {
+    parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels){ }
+
+    unsigned int simd_width;
+    unsigned int local_size_0;
+    unsigned int local_size_1;
+    unsigned int num_kernels;
+  };
+
+private:
+  /** @brief Functor to map the statements to the types defined in mapped_objects.hpp */
+  class map_functor : public tree_parsing::traversal_functor
+  {
+
+    scheduler::statement_node_numeric_type numeric_type(scheduler::statement const * statement, vcl_size_t root_idx) const
+    {
+      scheduler::statement_node const * root_node = &statement->array()[root_idx];
+      while (root_node->lhs.numeric_type==scheduler::INVALID_NUMERIC_TYPE)
+        root_node = &statement->array()[root_node->lhs.node_index];
+      return root_node->lhs.numeric_type;
+    }
+
+  public:
+    typedef tools::shared_ptr<mapped_object> result_type;
+
+    map_functor(symbolic_binder & binder, mapping_type & mapping) : binder_(binder), mapping_(mapping){ }
+
+    /** @brief Binary leaf */
+    template<class T>
+    result_type binary_leaf(scheduler::statement const * statement, vcl_size_t root_idx, mapping_type const * mapping) const
+    {
+      return result_type(new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
+    }
+
+    template<class NumericT>
+    result_type operator()(NumericT const & /*scalar*/) const
+    {
+      return result_type(new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+    }
+
+    /** @brief Scalar mapping */
+    template<class NumericT>
+    result_type operator()(scalar<NumericT> const & scal) const
+    {
+      return result_type(new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(scal))));
+    }
+
+    /** @brief Vector mapping */
+    template<class NumericT>
+    result_type operator()(vector_base<NumericT> const & vec) const
+    {
+      return result_type(new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(vec))));
+    }
+
+    /** @brief Implicit vector mapping */
+    template<class NumericT>
+    result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) const
+    {
+      return result_type(new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+    }
+
+    /** @brief Matrix mapping */
+    template<class NumericT>
+    result_type operator()(matrix_base<NumericT> const & mat) const
+    {
+      return result_type(new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(mat)),
+                                           viennacl::traits::row_major(mat)));
+    }
+
+    /** @brief Implicit matrix mapping */
+    template<class NumericT>
+    result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) const
+    {
+      return result_type(new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+    }
+
+    /** @brief Traversal functor */
+    void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const {
+      mapping_type::key_type key(root_idx, leaf_t);
+      scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+      if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+        mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *this)));
+      else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+        mapping_.insert(mapping_type::value_type(key,  utils::call_on_element(root_node.rhs, *this)));
+      else if ( leaf_t== PARENT_NODE_TYPE)
+      {
+        if (root_node.op.type==scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
+        else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
+        else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
+        else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
+        else if (is_scalar_reduction(root_node))
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
+        else if (is_vector_reduction(root_node))
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
+        else if (root_node.op.type == scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
+        else if (root_node.op.type == scheduler::OPERATION_UNARY_TRANS_TYPE)
+          mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
+      }
+    }
+
+  private:
+    symbolic_binder & binder_;
+    mapping_type & mapping_;
+  };
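+
+  // Minimal usage sketch for map_functor (hypothetical driver code, shown only
+  // to illustrate how the mapping table is built; the actual call sites live
+  // in the derived templates):
+  //
+  //   tools::shared_ptr<symbolic_binder> binder = make_binder(BIND_ALL_UNIQUE);
+  //   mapping_type mapping;
+  //   tree_parsing::traverse(statement, statement.root(),
+  //                          map_functor(*binder, mapping), true);
+  //
+  // Afterwards every non-composite leaf and every special parent node (diag,
+  // row, column, reductions, matrix product, trans) has a mapped_object entry
+  // in 'mapping'.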
+
+  /** @brief functor for generating the prototype of a statement */
+  class prototype_generation_traversal : public tree_parsing::traversal_functor
+  {
+  private:
+    std::set<std::string> & already_generated_;
+    std::string & str_;
+    mapping_type const & mapping_;
+    std::map<std::string, unsigned int> const & widths_;
+  public:
+    prototype_generation_traversal(std::set<std::string> & already_generated, std::string & str, mapping_type const & mapping, std::map<std::string, unsigned int> const & widths) :
+      already_generated_(already_generated), str_(str),  mapping_(mapping), widths_(widths){ }
+
+    void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const
+    {
+      scheduler::statement_node const & root_node = statement.array()[root_idx];
+      if ( (leaf==LHS_NODE_TYPE && root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+           ||(leaf==RHS_NODE_TYPE && root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY) )
+      {
+        mapped_object * obj = at(mapping_, std::make_pair(root_idx,leaf)).get();
+        if(widths_.find(obj->name())!=widths_.end())
+          obj->append_kernel_arguments(already_generated_, str_, at(widths_, obj->name()));
+        else
+          obj->append_kernel_arguments(already_generated_, str_, 1);
+      }
+    }
+  };
+
+
+
+  /** @brief functor for setting the arguments of a kernel */
+  class set_arguments_functor : public tree_parsing::traversal_functor
+  {
+  public:
+    typedef void result_type;
+
+    set_arguments_functor(symbolic_binder & binder, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ }
+
+    template<class NumericT>
+    result_type operator()(NumericT const & scal) const {
+      typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
+      kernel_.arg(current_arg_++, cl_scalartype(scal));
+    }
+
+    /** @brief Scalar mapping */
+    template<class NumericT>
+    result_type operator()(scalar<NumericT> const & scal) const {
+      if (binder_.bind(&viennacl::traits::handle(scal)))
+        kernel_.arg(current_arg_++, scal.handle().opencl_handle());
+    }
+
+    /** @brief Vector mapping */
+    template<class NumericT>
+    result_type operator()(vector_base<NumericT> const & vec) const {
+      if (binder_.bind(&viennacl::traits::handle(vec)))
+      {
+        kernel_.arg(current_arg_++, vec.handle().opencl_handle());
+        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
+        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
+      }
+    }
+
+    /** @brief Implicit vector mapping */
+    template<class NumericT>
+    result_type operator()(implicit_vector_base<NumericT> const & vec) const
+    {
+      typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
+      kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
+      if (vec.has_index())
+        kernel_.arg(current_arg_++, cl_uint(vec.index()));
+    }
+
+    /** @brief Matrix mapping */
+    template<class NumericT>
+    result_type operator()(matrix_base<NumericT> const & mat) const
+    {
+      if (binder_.bind(&viennacl::traits::handle(mat)))
+      {
+        kernel_.arg(current_arg_++, mat.handle().opencl_handle());
+        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat)));
+        if (mat.row_major())
+        {
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
+        }
+        else
+        {
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
+          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
+        }
+      }
+    }
+
+    /** @brief Implicit matrix mapping */
+    template<class NumericT>
+    result_type operator()(implicit_matrix_base<NumericT> const & mat) const
+    {
+      kernel_.arg(current_arg_++, typename viennacl::result_of::cl_type<NumericT>::type(mat.value()));
+    }
+
+    /** @brief Traversal entry point: dispatches the selected leaf of the node to the typed overloads above */
+    void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const
+    {
+      scheduler::statement_node const & root_node = statement.array()[root_idx];
+      if (leaf==LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+        utils::call_on_element(root_node.lhs, *this);
+      else if (leaf==RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+        utils::call_on_element(root_node.rhs, *this);
+    }
+
+  private:
+    symbolic_binder & binder_;
+    unsigned int & current_arg_;
+    viennacl::ocl::kernel & kernel_;
+  };
+
+protected:
+
+  static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const &statements,
+                                 std::map<std::string, unsigned int> const & widths)
+  {
+    statements_container::data_type::const_iterator sit;
+    std::vector<mapping_type>::const_iterator mit;
+    std::set<std::string> already_generated;
+
+    std::string arguments = first_arguments;
+    for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
+      tree_parsing::traverse(*sit, sit->root(), prototype_generation_traversal(already_generated, arguments, *mit, widths), true);
+    arguments.erase(arguments.size()-1); //Last comma pruned
+    stream << "__kernel " << "void " << name << "(" << arguments << ")" << std::endl;
+  }
+
+  static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const & statements)
+  {
+    generate_prototype(stream, name, first_arguments, mappings, statements, std::map<std::string, unsigned int>());
+  }
+
+  void set_arguments(statements_container const & statements, viennacl::ocl::kernel & kernel, unsigned int & current_arg)
+  {
+    tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
+    for (statements_container::data_type::const_iterator itt = statements.data().begin(); itt != statements.data().end(); ++itt)
+      tree_parsing::traverse(*itt, itt->root(), set_arguments_functor(*binder,current_arg,kernel), true);
+  }
+
+  class invalid_template_exception : public std::exception
+  {
+  public:
+    invalid_template_exception() : message_() {}
+    invalid_template_exception(std::string message) :
+      message_("ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message + "\n"
+               "If you are using a built-in template, please report this on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible.\n"
+               "If you are using your own template, please try using other parameters.") {}
+    virtual const char* what() const throw() { return message_.c_str(); }
+    virtual ~invalid_template_exception() throw() {}
+  private:
+    std::string message_;
+  };
+
+  static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, utils::kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size)
+  {
+    if (policy==FETCH_FROM_GLOBAL_STRIDED)
+    {
+      init = domain_id;
+      upper_bound = bound;
+      inc = domain_size;
+    }
+    else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+    {
+      std::string chunk_size = "chunk_size";
+      std::string chunk_start = "chunk_start";
+      std::string chunk_end = "chunk_end";
+
+      stream << "unsigned int " << chunk_size << " = (" << bound << "+" << domain_size << "-1)/" << domain_size << ";" << std::endl;
+      stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << chunk_size << ";" << std::endl;
+      stream << "unsigned int " << chunk_end << " = min(" << chunk_start << "+" << chunk_size << ", " << bound << ");" << std::endl;
+      init = chunk_start;
+      upper_bound = chunk_end;
+      inc = "1";
+    }
+  }
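+  // fetching_loop_info: FETCH_FROM_GLOBAL_STRIDED yields loop bounds of the form
+  //   for (i = domain_id; i < bound; i += domain_size)
+  // so that neighbouring work items access neighbouring elements, while
+  // FETCH_FROM_GLOBAL_CONTIGUOUS first emits the chunk_size/chunk_start/chunk_end
+  // declarations and then assigns each work item one contiguous chunk:
+  //   for (i = chunk_start; i < chunk_end; i += 1)
+  // with chunk_size = (bound + domain_size - 1) / domain_size.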
+
+  static bool is_node_trans(scheduler::statement::container_type const & array, vcl_size_t root_idx, leaf_t leaf_type)
+  {
+    bool res = false;
+    scheduler::lhs_rhs_element scheduler::statement_node::*ptr;
+    if (leaf_type==LHS_NODE_TYPE)
+      ptr = &scheduler::statement_node::lhs;
+    else
+      ptr = &scheduler::statement_node::rhs;
+    scheduler::statement_node const * node = &array[root_idx];
+    while ((node->*ptr).type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+    {
+      if (array[(node->*ptr).node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
+        res = !res;
+      node = &array[(node->*ptr).node_index];
+    }
+    return res;
+  }
+
+protected:
+
+  static std::string append_simd_suffix(std::string const & str, unsigned int i)
+  {
+    assert(i < 16);
+    static char suffixes[] = {'0','1','2','3','4','5','6','7','8','9',
+                             'a','b','c','d','e','f'};
+    return str + tools::to_string(suffixes[i]);
+  }
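+  // Example: append_simd_suffix("x", 10) yields "xa"; the hexadecimal digits mirror the
+  // OpenCL vector component names .s0 ... .sf, hence the assert on SIMD widths up to 16.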
+
+  static bool is_striding_operator(scheduler::statement_node const & node)
+  {
+    return node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE
+            || node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE
+            || node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE;
+  }
+
+  static bool has_strided_access(statements_container const & statements)
+  {
+    for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
+    {
+      //checks for vectors
+      std::vector<scheduler::lhs_rhs_element> vectors;
+      tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_VECTOR_TYPE, vectors), true);
+      for (std::vector<scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
+        if (utils::call_on_vector(*itt, utils::stride_fun())>1)
+          return true;
+
+      //checks for matrices
+      std::vector<scheduler::lhs_rhs_element> matrices;
+      tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_MATRIX_TYPE, matrices), true);
+      for (std::vector<scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
+        if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>1)
+          return true;
+
+      std::vector<vcl_size_t> striding_operators;
+      tree_parsing::traverse(*it, it->root(), tree_parsing::filter(&is_striding_operator, striding_operators), false);
+      if(striding_operators.size() > 0)
+          return true;
+    }
+    return false;
+  }
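+  // has_strided_access: true if any dense vector operand has a non-unit stride, any dense
+  // matrix operand has a non-unit stride in either dimension, or the statement contains a
+  // row/column/diag extraction. Callers presumably use this to fall back to simd_width 1,
+  // since vectorized loads/stores (vloadN/vstoreN) require contiguous elements.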
+
+  static vcl_size_t vector_size(scheduler::statement_node const & node, bool up_to_internal_size)
+  {
+    using namespace scheduler;
+    using namespace utils;
+    if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE)
+    {
+      vcl_size_t size1 = up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
+      vcl_size_t size2 = up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
+      return std::min<vcl_size_t>(size1, size2);
+    }
+    else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE)
+      return up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
+    else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE)
+      return up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
+    else
+      return up_to_internal_size?call_on_vector(node.lhs, internal_size_fun()):call_on_vector(node.lhs, size_fun());
+  }
+
+  //NB: templates are not used here because declaring the functor outside of the generate() functions would be harder to read
+  struct loop_body_base
+  {
+    virtual void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const = 0;
+    virtual ~loop_body_base() {}
+  };
+
+  static void element_wise_loop_1D(utils::kernel_generation_stream & stream, loop_body_base const & loop_body,
+                                   fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size)
+  {
+    std::string strwidth = tools::to_string(simd_width);
+    std::string boundround = bound + "/" + strwidth;
+
+    std::string init, upper_bound, inc;
+    fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size);
+    stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl;
+    stream << "{" << std::endl;
+    stream.inc_tab();
+    loop_body(stream, simd_width);
+    stream.dec_tab();
+    stream << "}" << std::endl;
+
+    if (simd_width>1)
+    {
+      stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl;
+      stream << "{" << std::endl;
+      stream.inc_tab();
+      loop_body(stream, 1);
+      stream.dec_tab();
+      stream << "}" << std::endl;
+    }
+  }
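+  // element_wise_loop_1D: the first loop processes bound/simd_width packets of simd_width
+  // elements according to the chosen fetching policy; for simd_width > 1 a strided scalar
+  // remainder loop starting at (bound/simd_width)*simd_width handles the leftover elements.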
+
+  static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr)
+  {
+    if (simd_width==1)
+      return "(" + ptr + ")[" + offset + "] = " + value;
+    else
+      return utils::append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
+  }
+
+  static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr)
+  {
+    if (simd_width==1)
+      return "(" + ptr + ")[" + offset + "]";
+    else
+      return utils::append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")";
+  }
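+  // Examples of the emitted access code: vload(1, "off", "p") gives "(p)[off]",
+  // vload(4, "off", "p") gives roughly "vload4(off, p)", and vstore(4, "v", "off", "p")
+  // gives "vstore4(v, off, p)".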
+
+private:
+  /** @brief Generates the body of the associated kernel function */
+  virtual std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mapping) const = 0;
+
+public:
+  template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
+
+  virtual ~template_base(){ }
+
+  std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
+  {
+    statements_container::data_type::const_iterator sit;
+    std::vector<mapping_type>::iterator mit;
+
+    if(int err = check_invalid(statements, device))
+      throw generator_not_supported_exception("The supplied parameters for this template are invalid: err " + tools::to_string(err));
+
+    //Create mapping
+    std::vector<mapping_type> mappings(statements.data().size());
+    tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
+    for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
+      tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), true);
+
+    return generate_impl(kernel_prefix, statements, mappings);
+  }
+
+  /** @brief Returns a nonzero error code if the profile has undefined behavior on the given device, and 0 otherwise */
+  virtual int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const = 0;
+
+  virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) = 0;
+
+  virtual tools::shared_ptr<template_base> clone() const = 0;
+private:
+  binding_policy_t binding_policy_;
+};
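+// Typical flow for a concrete template: generate() maps each statement via map_functor and
+// delegates to generate_impl() for the kernel sources; enqueue() implementations are then
+// expected to bind the runtime operands through set_arguments() before launching the kernels.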
+
+
+template<class TemplateType, class ParametersType>
+class template_base_impl : public template_base
+{
+private:
+  virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const { return TEMPLATE_VALID; }
+
+  virtual unsigned int n_lmem_elements() const { return 0; }
+
+public:
+  typedef ParametersType parameters_type;
+
+  /** @brief The constructor */
+  template_base_impl(parameters_type const & parameters, binding_policy_t binding_policy) : template_base(binding_policy), p_(parameters){ }
+
+  parameters_type const & parameters() const
+  {
+    return p_;
+  }
+
+  tools::shared_ptr<template_base> clone() const
+  {
+    return tools::shared_ptr<template_base>(new TemplateType(*dynamic_cast<TemplateType const *>(this)));
+  }
+
+  /** @brief Returns a nonzero error code if the profile has undefined behavior on the given device, and 0 otherwise */
+  int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const
+  {
+    using namespace viennacl::tools;
+
+    scheduler::statement const & statement = statements.data().front();
+    unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
+
+    //Query device information
+    vcl_size_t lmem_available = static_cast<vcl_size_t>(device.local_mem_size());
+    vcl_size_t lmem_usage = scalartype_size*n_lmem_elements();
+    if (lmem_usage>lmem_available)
+      return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
+
+    //Invalid work group size
+    vcl_size_t max_workgroup_size = device.max_work_group_size();
+    std::vector<vcl_size_t> max_work_item_sizes = device.max_work_item_sizes();
+    if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
+      return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
+    if (p_.local_size_0 > max_work_item_sizes[0])
+      return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
+
+    if (p_.local_size_1 > max_work_item_sizes[1])
+      return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
+
+    //Advice from the Intel guide
+    unsigned int warp_size = 8;
+    if (device.type()==CL_DEVICE_TYPE_GPU)
+    {
+      //Advice from the nvidia guide
+      warp_size = 32;
+      //Advice from the AMD guide
+      if (device.vendor_id()==4098)
+        warp_size = 64;
+    }
+    if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
+      return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
+
+    //Invalid SIMD Width
+    if (p_.simd_width!=1 && p_.simd_width!=2 &&
+        p_.simd_width!=4 && p_.simd_width!=8 &&
+        p_.simd_width!=16)
+      return TEMPLATE_INVALID_SIMD_WIDTH;
+
+    return check_invalid_impl(device);
+  }
+
+protected:
+  parameters_type p_;
+};
+
+}
+}
+
+#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
new file mode 100644
index 0000000..8645e7d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
@@ -0,0 +1,1703 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
+ *  @brief OpenCL kernel file for compressed_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_compressed_matrix_block_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void block_trans_lu_backward( \n");
+  source.append("  __global const unsigned int * row_jumper_U,  \n");     //U part (note that U is transposed in memory)
+  source.append("  __global const unsigned int * column_indices_U, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
+  source.append("  __global const unsigned int * block_offsets, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+  source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+  source.append("  unsigned int row_start; \n");
+  source.append("  unsigned int row_stop; \n");
+  source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+  source.append("  if (col_start >= col_stop) \n");
+  source.append("    return; \n");
+
+    //backward elimination, using U and diagonal_U
+  source.append("  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
+  source.append("  { \n");
+  source.append("    unsigned int col = (col_stop - iter) - 1; \n");
+  source.append("    result_entry = result[col] / diagonal_U[col]; \n");
+  source.append("    row_start = row_jumper_U[col]; \n");
+  source.append("    row_stop  = row_jumper_U[col + 1]; \n");
+  source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+  source.append("      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+    //divide result vector by diagonal:
+  source.append("  for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
+  source.append("    result[col] /= diagonal_U[col]; \n");
+  source.append("} \n");
+}
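+// block_trans_lu_backward: each work group owns the column block
+// [block_offsets[2*g], block_offsets[2*g+1]) of U, which is stored transposed
+// (row_jumper_U/column_indices_U/elements_U index the columns of U). Columns are
+// eliminated from last to first using x[col] = result[col] / diagonal_U[col], and the
+// final loop writes the solved values back by dividing the block by the diagonal.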
+
+template<typename StringT>
+void generate_compressed_matrix_block_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void block_trans_unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_jumper_L,  \n");     //L part (note that L is transposed in memory)
+  source.append("  __global const unsigned int * column_indices_L, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
+  source.append("  __global const unsigned int * block_offsets, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+  source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+  source.append("  unsigned int row_start = row_jumper_L[col_start]; \n");
+  source.append("  unsigned int row_stop; \n");
+  source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+  source.append("  if (col_start >= col_stop) \n");
+  source.append("    return; \n");
+
+    //forward elimination, using L:
+  source.append("  for (unsigned int col = col_start; col < col_stop; ++col) \n");
+  source.append("  { \n");
+  source.append("    result_entry = result[col]; \n");
+  source.append("    row_stop = row_jumper_L[col + 1]; \n");
+  source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+  source.append("      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
+  source.append("    row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+}
+
+namespace detail
+{
+  /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
+  template<typename StringT>
+  void generate_compressed_matrix_dense_matrix_mult(StringT & source, std::string const & numeric_string,
+                                                    bool B_transposed, bool B_row_major, bool C_row_major)
+  {
+    source.append("__kernel void ");
+    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+    source.append("( \n");
+    source.append("  __global const unsigned int * sp_mat_row_indices, \n");
+    source.append("  __global const unsigned int * sp_mat_col_indices, \n");
+    source.append("  __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
+    source.append("  __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+    source.append("  unsigned int d_mat_row_start, \n");
+    source.append("  unsigned int d_mat_col_start, \n");
+    source.append("  unsigned int d_mat_row_inc, \n");
+    source.append("  unsigned int d_mat_col_inc, \n");
+    source.append("  unsigned int d_mat_row_size, \n");
+    source.append("  unsigned int d_mat_col_size, \n");
+    source.append("  unsigned int d_mat_internal_rows, \n");
+    source.append("  unsigned int d_mat_internal_cols, \n");
+    source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+    source.append("  unsigned int result_row_start, \n");
+    source.append("  unsigned int result_col_start, \n");
+    source.append("  unsigned int result_row_inc, \n");
+    source.append("  unsigned int result_col_inc, \n");
+    source.append("  unsigned int result_row_size, \n");
+    source.append("  unsigned int result_col_size, \n");
+    source.append("  unsigned int result_internal_rows, \n");
+    source.append("  unsigned int result_internal_cols) { \n");
+
+      // split work rows (sparse matrix rows) to thread groups
+    source.append("  for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
+
+    source.append("    unsigned int row_start = sp_mat_row_indices[row]; \n");
+    source.append("    unsigned int row_end = sp_mat_row_indices[row+1]; \n");
+
+        // split result cols between threads in a thread group
+    source.append("    for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
+
+    source.append("      "); source.append(numeric_string); source.append(" r = 0; \n");
+
+    source.append("      for (unsigned int k = row_start; k < row_end; k ++) { \n");
+
+    source.append("        unsigned int j = sp_mat_col_indices[k]; \n");
+    source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");
+
+    source.append("        "); source.append(numeric_string);
+    if (B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +   j * d_mat_col_inc ]; \n");
+    else if (B_transposed && !B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start +  j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+    else if (!B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+    else
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+    source.append("        r += x * y; \n");
+    source.append("      } \n");
+
+    if (C_row_major)
+      source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+    else
+      source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+    source.append("    } \n");
+    source.append("  } \n");
+
+    source.append("} \n");
+
+  }
+}
+template<typename StringT>
+void generate_compressed_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false,  true);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true,  true);
+
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false,  true);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true,  true);
+}
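+// The eight kernels generated above differ only in how d_mat (B, optionally transposed,
+// row- or column-major) and result (C, row- or column-major) are indexed. In all of them
+// the sparse rows are distributed over work groups and the result columns over the work
+// items of a group, each entry accumulating r = sum_k A[row,k] * B[k,col] over the
+// nonzeros of the row.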
+
+template<typename StringT>
+void generate_compressed_matrix_jacobi(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void jacobi( \n");
+ source.append("  __global const unsigned int * row_indices, \n");
+ source.append("  __global const unsigned int * column_indices, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append("  "); source.append(numeric_string); source.append(" weight, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * old_result, \n");
+ source.append("  __global "); source.append(numeric_string); source.append(" * new_result, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * rhs, \n");
+ source.append("  unsigned int size) \n");
+ source.append("  { \n");
+ source.append("   "); source.append(numeric_string); source.append(" sum, diag=1; \n");
+ source.append("   int col; \n");
+ source.append("   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("   { \n");
+ source.append("     sum = 0; \n");
+ source.append("     for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
+ source.append("     { \n");
+ source.append("       col = column_indices[j]; \n");
+ source.append("       if (i == col) \n");
+ source.append("         diag = elements[j]; \n");
+ source.append("       else \n");
+ source.append("         sum += elements[j] * old_result[col]; \n");
+ source.append("     } \n");
+ source.append("     new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
+ source.append("    } \n");
+ source.append("  } \n");
+
+}
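+// The kernel above performs one weighted Jacobi sweep:
+//   new_result[i] = weight * (rhs[i] - sum_{j != i} A[i][j] * old_result[j]) / A[i][i]
+//                   + (1 - weight) * old_result[i]
+// with A given in CSR form (row_indices, column_indices, elements).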
+
+
+template<typename StringT>
+void generate_compressed_matrix_lu_backward(StringT & source, std::string const & numeric_string)
+{
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int col_index_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = size-1; \n");
+  source.append("  unsigned int row_at_window_start = size-1; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[size-1]; \n");
+
+  source.append("  unsigned int i = loop_end + get_local_id(0); \n");
+  source.append("  while (1) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data from back to front:
+  source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+  source.append("      { \n");
+  source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+  source.append("        if (i+k >= nnz) \n");
+  source.append("          continue; \n");
+
+  source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+  source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("        else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+  source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        else if (col_index_buffer[k] == current_row) \n");
+  source.append("          diagonal_entry = element_buffer[k]; \n");
+
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
+  source.append("          if (current_row > 0) //load next row's data \n");
+  source.append("          { \n");
+  source.append("            --current_row; \n");
+  source.append("            next_row = row_indices[current_row]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  source.append("    if (i < get_local_size(0)) \n");
+  source.append("      break; \n");
+
+  source.append("    i -= get_local_size(0); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
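+// lu_backward walks the nonzeros from the back in windows of get_local_size(0) entries:
+// all threads stage elements, column indices and the referenced vector entries in shared
+// memory, then thread 0 performs the serial backward substitution within the window and
+// writes vector[row] = current_vector_entry / diagonal_entry whenever a row is completed.
+// The shared buffers are sized for work groups of at most 128 threads.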
+
+template<typename StringT>
+void generate_compressed_matrix_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int col_index_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = 0; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" diagonal_entry; \n");
+  source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[1]; \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data:
+  source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+  source.append("      { \n");
+  source.append("        if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
+  source.append("          ++current_row; \n");
+  source.append("          if (current_row < size) \n"); //load next row's data
+  source.append("          { \n");
+  source.append("            next_row = row_indices[current_row+1]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+  source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+  source.append("        { \n");
+  source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+  source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("          else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+  source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        } \n");
+  source.append("        else if (col_index_buffer[k] == current_row) \n");
+  source.append("          diagonal_entry = element_buffer[k]; \n");
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void row_info_extractor( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  unsigned int option \n");
+  source.append("  ) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+
+  source.append("    switch (option) \n");
+  source.append("    { \n");
+  source.append("      case 0: \n"); //inf-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value = max(value, fabs(elements[i])); \n");
+  source.append("        break; \n");
+
+  source.append("      case 1: \n"); //1-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value += fabs(elements[i]); \n");
+  source.append("        break; \n");
+
+  source.append("      case 2: \n"); //2-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value += elements[i] * elements[i]; \n");
+  source.append("        value = sqrt(value); \n");
+  source.append("        break; \n");
+
+  source.append("      case 3: \n"); //diagonal entry
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("        { \n");
+  source.append("          if (column_indices[i] == row) \n");
+  source.append("          { \n");
+  source.append("            value = elements[i]; \n");
+  source.append("            break; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+  source.append("        break; \n");
+
+  source.append("      default: \n");
+  source.append("        break; \n");
+  source.append("    } \n");
+  source.append("    result[row] = value; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
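+// row_info_extractor: 'option' selects the per-row quantity written to result[row]:
+// 0 = infinity norm, 1 = 1-norm, 2 = 2-norm, 3 = diagonal entry of the row.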
+
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = size; \n");
+  source.append("  unsigned int row_at_window_end; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int i = (nnz - i2) - 1; \n");
+  source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i2 < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_dec = 0; \n");
+  source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
+  source.append("        ++row_index_dec; \n");
+  source.append("      row_index = row_at_window_start - row_index_dec; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //backward elimination
+  source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row = row_at_window_start - row2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index < row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+
+    // final step: Divide vector by diagonal entries:
+  source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+  source.append("    vector[i] /= diagonal_entries[i]; \n");
+  source.append("} \n");
+
+}
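+// trans_lu_backward above, trans_lu_forward below and the trans_unit_* variants further
+// down share the same scheme: the nonzeros are visited in windows of get_local_size(0),
+// row_index_lookahead caches a slice of row_indices so each thread can locate the row of
+// its nonzero without extra global reads, and the elimination then sweeps the rows covered
+// by the current window with every thread applying the update for its own entry.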
+
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  unsigned int row_at_window_end = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_inc = 0; \n");
+  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+  source.append("        ++row_index_inc; \n");
+  source.append("      row_index = row_at_window_start + row_index_inc; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //forward elimination
+  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+  source.append("    { \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+
+    // final step: Divide vector by diagonal entries:
+  source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+  source.append("    vector[i] /= diagonal_entries[i]; \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = size; \n");
+  source.append("  unsigned int row_at_window_end; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int i = (nnz - i2) - 1; \n");
+  source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i2 < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_dec = 0; \n");
+  source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
+  source.append("        ++row_index_dec; \n");
+  source.append("      row_index = row_at_window_start - row_index_dec; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //backward elimination
+  source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row = row_at_window_start - row2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index < row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  unsigned int row_at_window_end = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_inc = 0; \n");
+  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+  source.append("        ++row_index_inc; \n");
+  source.append("      row_index = row_at_window_start + row_index_inc; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //forward elimination
+  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+  source.append("    { \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_forward_slow( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = 0; row < size; ++row) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("    unsigned int row_start = row_indices[row]; \n");
+  source.append("    unsigned int row_stop  = row_indices[row + 1]; \n");
+  source.append("    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
+  source.append("    { \n");
+  source.append("      unsigned int col_index = column_indices[entry_index]; \n");
+  source.append("      if (col_index > row) \n");
+  source.append("        vector[col_index] -= result_entry * elements[entry_index]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void unit_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local  unsigned int col_index_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = size-1; \n");
+  source.append("  unsigned int row_at_window_start = size-1; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[size-1]; \n");
+
+  source.append("  unsigned int i = loop_end + get_local_id(0); \n");
+  source.append("  while (1) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+      // traverse through all the loaded data from back to front:
+  source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+  source.append("      { \n");
+  source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+  source.append("        if (i+k >= nnz) \n");
+  source.append("          continue; \n");
+
+  source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+  source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("        else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+  source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry; \n");
+  source.append("          if (current_row > 0) \n"); //load next row's data
+  source.append("          { \n");
+  source.append("            --current_row; \n");
+  source.append("            next_row = row_indices[current_row]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  source.append("    if (i < get_local_size(0)) \n");
+  source.append("      break; \n");
+
+  source.append("    i -= get_local_size(0); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local  unsigned int col_index_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = 0; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+  source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[1]; \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data:
+  source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+  source.append("      { \n");
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry; \n");
+  source.append("          ++current_row; \n");
+  source.append("          if (current_row < size) //load next row's data \n");
+  source.append("          { \n");
+  source.append("            next_row = row_indices[current_row+1]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+  source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+  source.append("        { \n");
+  source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+  source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("          else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+  source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        } \n");
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } //for i \n");
+  source.append("} \n");
+
+}
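
For orientation, here is a minimal single-threaded C++ sketch of what the unit_lu_forward kernel above computes: an in-place forward substitution L*y = z for a unit lower triangular CSR matrix whose strictly lower entries are stored explicitly (double stands in for the numeric type; parameter names mirror the kernel arguments).

  #include <cstddef>
  #include <vector>

  // Reference forward substitution: vector holds z on entry and y on exit.
  void unit_lu_forward_reference(std::vector<unsigned int> const & row_indices,
                                 std::vector<unsigned int> const & column_indices,
                                 std::vector<double>       const & elements,
                                 std::vector<double>             & vector)
  {
    std::size_t size = row_indices.size() - 1;
    for (std::size_t row = 0; row < size; ++row)
    {
      double entry = vector[row];
      for (unsigned int k = row_indices[row]; k < row_indices[row + 1]; ++k)
        if (column_indices[k] < row)          // strictly lower part; unit diagonal is implicit
          entry -= elements[k] * vector[column_indices[k]];
      vector[row] = entry;
    }
  }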
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_nvidia(StringT & source, std::string const & numeric_string, unsigned int subwarp_size, bool with_alpha_beta)
+{
+  std::stringstream ss;
+  ss << subwarp_size;
+
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_nvidia_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul_nvidia( \n");
+  source.append("    __global const unsigned int * row_indices, \n");
+  source.append("    __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("    uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("    "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("    uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("    , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+
+  source.append("  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+  source.append("  const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
+  source.append("  const unsigned int block_start = get_group_id(0) * block_increment; \n");
+  source.append("  const unsigned int block_stop  = min(block_start + block_increment, layout_result.z); \n");
+
+  source.append("  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + "; \n");
+  source.append("                    row  < block_stop; \n");
+  source.append("                    row += get_local_size(0) / " + ss.str() + ") \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+  source.append("      dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+  source.append("    shared_elements[get_local_id(0)] = dot_prod; \n");
+  source.append("    #pragma unroll \n");
+  source.append("    for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+  source.append("      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+  source.append("    if (id_in_row == 0) \n");
+  if (with_alpha_beta)
+    source.append("      result[row * layout_result.y + layout_result.x] = alpha * shared_elements[get_local_id(0)] + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
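
The subwarp reduction in vec_mul_nvidia relies on the XOR ("butterfly") pattern shared[id] += shared[id ^ k]: after log2(subwarp_size) rounds every slot holds the full sum, so the lane with id_in_row == 0 can write the row result directly. A small host-side C++ illustration of that pattern, assuming a power-of-two size:

  #include <cstddef>
  #include <vector>

  // Host-side illustration of the XOR ("butterfly") reduction within one subwarp:
  // after log2(n) rounds every slot holds the full sum (n must be a power of two).
  // The temporary copy mimics the lock-step execution of a warp on the device.
  void butterfly_reduce(std::vector<double> & subwarp_values)
  {
    std::size_t n = subwarp_values.size();
    for (std::size_t k = 1; k < n; k *= 2)
    {
      std::vector<double> previous(subwarp_values);
      for (std::size_t id = 0; id < n; ++id)
        subwarp_values[id] = previous[id] + previous[id ^ k];
    }
  }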
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("  , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
+
+  source.append("  unsigned int row_start = row_blocks[get_group_id(0)]; \n");
+  source.append("  unsigned int row_stop  = row_blocks[get_group_id(0) + 1]; \n");
+  source.append("  unsigned int rows_to_process = row_stop - row_start; \n");
+  source.append("  unsigned int element_start = row_indices[row_start]; \n");
+  source.append("  unsigned int element_stop = row_indices[row_stop]; \n");
+
+  source.append("  if (rows_to_process > 4) { \n"); // CSR stream
+      // load to shared buffer:
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      // use one thread per row to sum:
+  source.append("    for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("      unsigned int thread_row_start = row_indices[row]     - element_start; \n");
+  source.append("      unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
+  source.append("      for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+  source.append("        dot_prod += shared_elements[i]; \n");
+  if (with_alpha_beta)
+    source.append("      result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+      // use multiple threads for the summation
+  source.append("  else if (rows_to_process > 1) \n"); // CSR stream with local reduction
+  source.append("  {\n");
+      // load to shared buffer:
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
+  source.append("      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+    // sum each row separately using a reduction:
+  source.append("    for (unsigned int row = row_start; row < row_stop; ++row)\n");
+  source.append("    {\n");
+  source.append("      unsigned int current_row_start = row_indices[row]     - element_start;\n");
+  source.append("      unsigned int current_row_stop  = row_indices[row + 1] - element_start;\n");
+  source.append("      unsigned int thread_base_id  = current_row_start + get_local_id(0);\n");
+
+      // sum whatever exceeds the current buffer:
+  source.append("      for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
+  source.append("        shared_elements[thread_base_id] += shared_elements[j];\n");
+
+      // reduction
+  source.append("      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
+  source.append("      {\n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE);\n");
+  source.append("        if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
+  source.append("          shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
+  source.append("      }\n");
+  source.append("      "); source.append(numeric_string); source.append(" row_result = 0; \n");
+  source.append("      if (current_row_stop > current_row_start)\n");
+  source.append("        row_result = shared_elements[current_row_start]; \n");
+  source.append("      if (get_local_id(0) == 0)\n");
+  if (with_alpha_beta)
+    source.append("        result[row * layout_result.y + layout_result.x] = alpha * row_result + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0);\n");
+  else
+    source.append("        result[row * layout_result.y + layout_result.x] = row_result;\n");
+  source.append("    }\n");
+  source.append("  }\n");
+
+
+  source.append("  else  \n"); // CSR vector for a single row
+  source.append("  { \n");
+      // load and sum to shared buffer:
+  source.append("    shared_elements[get_local_id(0)] = 0; \n");
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("      shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+      // reduction to obtain final result
+  source.append("    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      if (get_local_id(0) < stride) \n");
+  source.append("        shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+  source.append("    } \n");
+
+  source.append("    if (get_local_id(0) == 0) \n");
+  if (with_alpha_beta)
+    source.append("      result[row_start * layout_result.y + layout_result.x] = alpha * shared_elements[0] + ((beta != 0) ? beta * result[row_start * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
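
The vec_mul kernel above expects a row_blocks array in which each block either fits into the 1024-entry shared buffer (multi-row blocks) or consists of a single heavy row handled by the CSR-vector branch. The host-side construction is not shown here; the following is only a hedged sketch of one way to build an array consistent with that invariant.

  #include <cstddef>
  #include <vector>

  // Hedged sketch: group consecutive rows until their combined nnz would exceed the
  // shared buffer; a row that alone exceeds it ends up in a block of its own.
  std::vector<unsigned int> build_row_blocks(std::vector<unsigned int> const & row_indices,
                                             unsigned int shared_buffer_size = 1024)
  {
    std::vector<unsigned int> row_blocks(1, 0);
    unsigned int block_nnz = 0;
    for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
    {
      unsigned int row_nnz = row_indices[row + 1] - row_indices[row];
      if (block_nnz > 0 && block_nnz + row_nnz > shared_buffer_size)
      {
        row_blocks.push_back(static_cast<unsigned int>(row));  // close the current block
        block_nnz = 0;
      }
      block_nnz += row_nnz;
    }
    row_blocks.push_back(static_cast<unsigned int>(row_indices.size() - 1));  // last block ends at the final row
    return row_blocks;
  }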
+
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul4(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul4_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul4( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const uint4 * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append("4 * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("  , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+  source.append("  unsigned int start, next_stop; \n");
+  source.append("  uint4 col_idx; \n");
+  source.append("  "); source.append(numeric_string); source.append("4 tmp_vec; \n");
+  source.append("  "); source.append(numeric_string); source.append("4 tmp_entries; \n");
+
+  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    dot_prod = 0; \n");
+  source.append("    start = row_indices[row] / 4; \n");
+  source.append("    next_stop = row_indices[row+1] / 4; \n");
+
+  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+  source.append("    { \n");
+  source.append("      col_idx = column_indices[i]; \n");
+
+  source.append("      tmp_entries = elements[i]; \n");
+  source.append("      tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
+
+  source.append("      dot_prod += dot(tmp_entries, tmp_vec); \n");
+  source.append("    } \n");
+  if (with_alpha_beta)
+    source.append("    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul8(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul8_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul8( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const uint8 * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append("8 * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+  source.append("  unsigned int start, next_stop; \n");
+  source.append("  uint8 col_idx; \n");
+  source.append("  "); source.append(numeric_string); source.append("8 tmp_vec; \n");
+  source.append("  "); source.append(numeric_string); source.append("8 tmp_entries; \n");
+
+  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    dot_prod = 0; \n");
+  source.append("    start = row_indices[row] / 8; \n");
+  source.append("    next_stop = row_indices[row+1] / 8; \n");
+
+  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+  source.append("    { \n");
+  source.append("      col_idx = column_indices[i]; \n");
+
+  source.append("      tmp_entries = elements[i]; \n");
+  source.append("      tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
+
+  source.append("      dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
+  source.append("      dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
+  source.append("    } \n");
+  if (with_alpha_beta)
+    source.append("    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_cpu(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void vec_mul_cpu( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
+  source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
+  source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
+  source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("      dot_prod += elements[i] * vector[column_indices[i]]; \n");
+  source.append("    result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0); \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
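
All of the vec_mul* kernels above compute the same CSR matrix-vector product; only the parallelization differs. A plain C++ reference, with the layout strides omitted and double standing in for the numeric type:

  #include <cstddef>
  #include <vector>

  // Reference of the product computed by the vec_mul* kernels:
  //   result = alpha * A * x + beta * result, with beta == 0 meaning "overwrite".
  void csr_spmv_reference(std::vector<unsigned int> const & row_indices,
                          std::vector<unsigned int> const & column_indices,
                          std::vector<double>       const & elements,
                          std::vector<double>       const & x,
                          double alpha, double beta,
                          std::vector<double>             & result)
  {
    for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
    {
      double dot_prod = 0;
      for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
        dot_prod += elements[i] * x[column_indices[i]];
      result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0);
    }
  }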
+
+
+
+/** @brief OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
+  *
+  * Each work group computes the maximum number of nonzeros per row of A over its assigned range of rows.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_1(StringT & source)
+{
+  source.append("__kernel void spgemm_stage1( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_column_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int * group_nnz_array) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
+  source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
+  source.append("  unsigned int max_A_nnz = 0; \n");
+  source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
+  source.append("    max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
+
+    // load and sum to shared buffer:
+  source.append("  __local unsigned int shared_nnz[256]; \n");
+  source.append("  shared_nnz[get_local_id(0)] = max_A_nnz; \n");
+
+    // reduction to obtain final result
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("      shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
+  source.append("  } \n");
+
+  source.append("  if (get_local_id(0) == 0) \n");
+  source.append("    group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
+  source.append("} \n");
+}
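
spgemm_stage1 leaves one partial maximum per work group in group_nnz_array; the global maximum number of nonzeros per row of A is then a trivial host-side reduction, for example:

  #include <algorithm>
  #include <vector>

  // Reduce the per-work-group maxima written by spgemm_stage1 to a single value.
  unsigned int max_nnz_per_row_of_A(std::vector<unsigned int> const & group_nnz_array)
  {
    return group_nnz_array.empty() ? 0u
                                   : *std::max_element(group_nnz_array.begin(), group_nnz_array.end());
  }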
+
+
+/** @brief OpenCL kernel for decomposing the factor A in C = A * B such that A = A_2 * G_1, where G_1 contains at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT & source)
+{
+  source.append("__kernel void spgemm_decompose_1( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  unsigned int max_per_row, \n");
+  source.append("  __global unsigned int * chunks_per_row) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
+  source.append("    unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
+  source.append("    chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
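
The chunk count in spgemm_decompose_1 is a ceiling division with a floor of one chunk per row; a host-side mirror makes the arithmetic explicit (for max_per_row == 32, rows with 0..32 nonzeros yield 1 chunk, 33..64 yield 2, and so on):

  // Host-side mirror of the per-row chunk count used by spgemm_decompose_1 above.
  inline unsigned int chunks_for_row(unsigned int num_entries, unsigned int max_per_row)
  {
    return (num_entries < max_per_row) ? 1u : ((num_entries - 1) / max_per_row + 1);
  }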
+
+
+/** @brief OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_A2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void spgemm_A2( \n");
+  source.append("  __global unsigned int *A2_row_indices, \n");
+  source.append("  __global unsigned int *A2_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *A2_elements, \n");
+  source.append("  unsigned int A2_size1, \n");
+  source.append("  __global const unsigned int *new_row_buffer) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
+  source.append("    unsigned int index_start = new_row_buffer[i]; \n");
+  source.append("    unsigned int index_stop  = new_row_buffer[i+1]; \n");
+
+  source.append("    A2_row_indices[i] = index_start; \n");
+
+  source.append("    for (unsigned int j = index_start; j < index_stop; ++j) { \n");
+  source.append("      A2_col_indices[j] = j; \n");
+  source.append("      A2_elements[j] = 1; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  source.append("  if (get_global_id(0) == 0) \n");
+  source.append("    A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
+  source.append("} \n");
+}
+
+/** @brief OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_G1(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void spgemm_G1( \n");
+  source.append("  __global unsigned int *G1_row_indices, \n");
+  source.append("  __global unsigned int *G1_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *G1_elements, \n");
+  source.append("  unsigned int G1_size1, \n");
+  source.append("  __global const unsigned int *A_row_indices, \n");
+  source.append("  __global const unsigned int *A_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" *A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  unsigned int A_nnz, \n");
+  source.append("  unsigned int max_per_row, \n");
+  source.append("  __global const unsigned int *new_row_buffer) \n");
+  source.append("{ \n");
+
+  // Part 1: Copy column indices and entries:
+  source.append("  for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
+  source.append("    G1_col_indices[i] = A_col_indices[i]; \n");
+  source.append("    G1_elements[i]    = A_elements[i]; \n");
+  source.append("  } \n");
+
+  // Part 2: Derive new row indices:
+  source.append("  for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
+  source.append("    unsigned int old_start = A_row_indices[i]; \n");
+  source.append("    unsigned int new_start = new_row_buffer[i]; \n");
+  source.append("    unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
+
+  source.append("    for (unsigned int j=0; j<row_chunks; ++j) \n");
+  source.append("      G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
+  source.append("  } \n");
+
+  // write last entry in row_buffer with thread 0:
+  source.append("  if (get_global_id(0) == 0) \n");
+  source.append("    G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
+  source.append("} \n");
+}
+
+
+
+/** @brief OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
+  *
+  * Computes the exact sparsity pattern of A*B.
+  * The result array C_row_indices contains the number of nonzeros in each row of C.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_2(StringT & source)
+{
+  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+  source.append("__kernel void spgemm_stage2( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_col_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global const unsigned int * B_row_indices, \n");
+  source.append("  __global const unsigned int * B_col_indices, \n");
+  source.append("  unsigned int B_size2, \n");
+  source.append("  __global unsigned int * C_row_indices) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+  source.append("  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+  source.append("  __local unsigned int shared_front[32]; \n");
+
+  source.append("  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_A_start = A_row_indices[row_C]; \n");
+  source.append("    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
+
+  source.append("    unsigned int my_row_B = row_A_start + get_local_id(0); \n");
+  source.append("    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
+  source.append("    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
+  source.append("    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+  source.append("    unsigned int num_nnz = 0; \n");
+  source.append("    if (row_A_end - row_A_start > 1) { \n"); // zero or no row can be processed faster
+
+  source.append("      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("      while (1) { \n");
+
+  // determine minimum index via reduction:
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        shared_front[get_local_id(0)] = current_front_index; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 8)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 4)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 2)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 1)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        if (shared_front[0] == B_size2) break; \n");
+
+  // update front:
+  source.append("        if (current_front_index == shared_front[0]) { \n");
+  source.append("          ++row_B_start; \n");
+  source.append("          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("        } \n");
+
+  source.append("        ++num_nnz;  \n");
+  source.append("      }  \n");
+  source.append("    } else { num_nnz = row_B_end - row_B_start; }\n");
+
+  // write number of entries found:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("      C_row_indices[row_C] = num_nnz; \n");
+
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
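
What spgemm_stage2 computes per row, written as a single-threaded C++ reference: the number of distinct column indices reachable through the rows of B selected by the nonzeros of row_C in A. The kernel obtains the same count by merging the sorted row fronts.

  #include <set>
  #include <vector>

  // Single-threaded reference for the per-row count of spgemm_stage2.
  unsigned int count_C_row_nnz(std::vector<unsigned int> const & A_row_indices,
                               std::vector<unsigned int> const & A_col_indices,
                               std::vector<unsigned int> const & B_row_indices,
                               std::vector<unsigned int> const & B_col_indices,
                               unsigned int row_C)
  {
    std::set<unsigned int> cols;
    for (unsigned int i = A_row_indices[row_C]; i < A_row_indices[row_C + 1]; ++i)
    {
      unsigned int row_B = A_col_indices[i];
      for (unsigned int j = B_row_indices[row_B]; j < B_row_indices[row_B + 1]; ++j)
        cols.insert(B_col_indices[j]);
    }
    return static_cast<unsigned int>(cols.size());
  }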
+
+
+/** @brief OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
+  *
+  * Computes A*B into C with known sparsity pattern (obtained from stages 1 and 2).
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_3(StringT & source, std::string const & numeric_string)
+{
+  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+  source.append("__kernel void spgemm_stage3( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global const unsigned int * B_row_indices, \n");
+  source.append("  __global const unsigned int * B_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * B_elements, \n");
+  source.append("  unsigned int B_size2, \n");
+  source.append("  __global unsigned int * C_row_indices, \n");
+  source.append("  __global unsigned int * C_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * C_elements) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+  source.append("  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+  source.append("  __local unsigned int shared_front[32]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_front_values[32]; \n");
+  source.append("  unsigned int local_id = get_local_id(0); \n");
+
+  source.append("  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_A_start = A_row_indices[row_C]; \n");
+  source.append("    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
+
+  source.append("    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); // single row is a special case
+  source.append("    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B]        : 0; \n");
+  source.append("    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index]     : 0; \n");
+  source.append("    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
+  source.append("    unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
+
+  source.append("    if (row_A_end - row_A_start > 1) { \n"); // zero or no row can be processed faster
+
+  source.append("      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
+
+  source.append("      unsigned int index_buffer = 0; \n");
+  source.append("      "); source.append(numeric_string); source.append(" value_buffer = 0; \n");
+  source.append("      unsigned int buffer_size = 0; \n");
+
+  source.append("      while (1) { \n");
+
+  // determine minimum index via reduction:
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        shared_front[local_id] = current_front_index; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 8)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 4)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 2)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 1)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        if (shared_front[0] == B_size2) break; \n");
+
+  // compute output value via reduction:
+  source.append("        shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 8)  shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 4)  shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 2)  shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 1)  shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  // update front:
+  source.append("        if (current_front_index == shared_front[0]) { \n");
+  source.append("          ++row_B_start; \n");
+  source.append("          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("          current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
+  source.append("        } \n");
+
+  // write current front to register buffer:
+  source.append("        index_buffer = (local_id == buffer_size) ? shared_front[0]        : index_buffer;  \n");
+  source.append("        value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer;  \n");
+  source.append("        ++buffer_size;  \n");
+
+  // flush register buffer via a coalesced write once full:
+  source.append("        if (buffer_size == get_local_size(0)) {  \n");
+  source.append("          C_col_indices[index_in_C] = index_buffer; \n");
+  source.append("          C_elements[index_in_C]    = value_buffer; \n");
+  source.append("        } \n");
+
+  // the following should be in the previous if-conditional, but a bug in NVIDIA drivers 34x.yz requires this slight rewrite
+  source.append("          index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
+  source.append("          buffer_size = (buffer_size == get_local_size(0)) ?                0  : buffer_size; \n");
+
+  source.append("      }  \n");
+
+  // write remaining entries in register buffer:
+  source.append("      if (local_id < buffer_size) {  \n");
+  source.append("        C_col_indices[index_in_C] = index_buffer; \n");
+  source.append("        C_elements[index_in_C]    = value_buffer; \n");
+  source.append("      } \n");
+
+  // copy to C in coalesced manner:
+  source.append("    } else { \n");
+  source.append("      for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
+  source.append("        C_col_indices[index_in_C] = B_col_indices[i]; \n");
+  source.append("        C_elements[index_in_C]    = val_A * B_elements[i]; \n");
+  source.append("        index_in_C += get_local_size(0); \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
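
The corresponding single-threaded reference for spgemm_stage3: accumulate C(row_C, :) as the sum over the nonzeros A(row_C, k) of A(row_C, k) * B(k, :), with the column indices kept in ascending order (double stands in for the numeric type; this is an illustrative sketch rather than the kernel itself).

  #include <map>
  #include <vector>

  // Single-threaded reference for one row of spgemm_stage3; std::map keeps columns sorted.
  std::map<unsigned int, double> compute_C_row(std::vector<unsigned int> const & A_row_indices,
                                               std::vector<unsigned int> const & A_col_indices,
                                               std::vector<double>       const & A_elements,
                                               std::vector<unsigned int> const & B_row_indices,
                                               std::vector<unsigned int> const & B_col_indices,
                                               std::vector<double>       const & B_elements,
                                               unsigned int row_C)
  {
    std::map<unsigned int, double> row_values;   // column index -> accumulated value
    for (unsigned int i = A_row_indices[row_C]; i < A_row_indices[row_C + 1]; ++i)
    {
      unsigned int row_B = A_col_indices[i];
      double val_A = A_elements[i];
      for (unsigned int j = B_row_indices[row_B]; j < B_row_indices[row_B + 1]; ++j)
        row_values[B_col_indices[j]] += val_A * B_elements[j];
    }
    return row_values;
  }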
+
+
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod(StringT & source, std::string const & numeric_string)
+{
+  generate_compressed_matrix_compressed_matrix_prod_1(source);
+  generate_compressed_matrix_compressed_matrix_prod_decompose_1(source);
+  generate_compressed_matrix_compressed_matrix_prod_A2(source, numeric_string);
+  generate_compressed_matrix_compressed_matrix_prod_G1(source, numeric_string);
+  generate_compressed_matrix_compressed_matrix_prod_2(source);
+  generate_compressed_matrix_compressed_matrix_prod_3(source, numeric_string);
+}
+
+template<typename StringT>
+void generate_compressed_matrix_assign_to_dense(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void assign_to_dense( \n");
+ source.append("  __global const unsigned int * row_indices, \n");
+ source.append("  __global const unsigned int * column_indices, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
+ source.append("  unsigned int B_row_start, \n");
+ source.append("  unsigned int B_col_start, \n");
+ source.append("  unsigned int B_row_inc, \n");
+ source.append("  unsigned int B_col_inc, \n");
+ source.append("  unsigned int B_row_size, \n");
+ source.append("  unsigned int B_col_size, \n");
+ source.append("  unsigned int B_internal_rows, \n");
+ source.append("  unsigned int B_internal_cols) { \n");
+
+ source.append("   for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
+ source.append("   { \n");
+ source.append("     unsigned int row_end = row_indices[i+1]; \n");
+ source.append("     for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
+ source.append("     { \n");
+ source.append("       B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
+ source.append("     } \n");
+ source.append("   } \n");
+ source.append("  } \n");
+
+}
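
A direct CPU port of assign_to_dense, useful for checking the strided index arithmetic (row-major dense buffer with B_internal_cols entries per padded row; double stands in for the numeric type):

  #include <vector>

  // Scatter a CSR matrix into a strided, row-major dense buffer B,
  // matching the index arithmetic of the assign_to_dense kernel above.
  void assign_to_dense_reference(std::vector<unsigned int> const & row_indices,
                                 std::vector<unsigned int> const & column_indices,
                                 std::vector<double>       const & elements,
                                 std::vector<double>             & B,
                                 unsigned int B_row_start, unsigned int B_col_start,
                                 unsigned int B_row_inc,   unsigned int B_col_inc,
                                 unsigned int B_row_size,  unsigned int B_internal_cols)
  {
    for (unsigned int i = 0; i < B_row_size; ++i)
      for (unsigned int j = row_indices[i]; j < row_indices[i + 1]; ++j)
        B[(B_row_start + i * B_row_inc) * B_internal_cols
          + B_col_start + column_indices[j] * B_col_inc] = elements[j];
  }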
+
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers). */
+template<typename NumericT>
+struct compressed_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_compressed_matrix_jacobi(source, numeric_string);
+      }
+      generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
+      generate_compressed_matrix_row_info_extractor(source, numeric_string);
+      if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+      {
+        generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, true);
+        generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, false);
+      }
+      gene

<TRUNCATED>

[13/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
new file mode 100644
index 0000000..b350fe0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
@@ -0,0 +1,945 @@
+#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/iterative_operations.hpp
+    @brief  Implementations of specialized kernels for fast iterative solvers using OpenCL
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/detail/vector_def.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/iterative.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+                                NumericT alpha,
+                                vector_base<NumericT> & p,
+                                vector_base<NumericT> & r,
+                                vector_base<NumericT> const & Ap,
+                                NumericT beta,
+                                vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update");
+  cl_uint    vec_size = cl_uint(viennacl::traits::size(result));
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
+}
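
A hedged CPU sketch of the fused update assumed for the cg_vector_update kernel driven here, reconstructed from its argument list: the pipelined CG step x += alpha*p, r -= alpha*Ap, p = r + beta*p, together with the partial <r, r> that the device kernel accumulates into inner_prod_buffer.

  #include <cstddef>
  #include <vector>

  // Assumed semantics of cg_vector_update, written out on the host:
  //   result += alpha * p;   r -= alpha * Ap;   p = r + beta * p;
  // Returns the <r, r> contribution that the kernel would stash in inner_prod_buffer.
  template<typename NumericT>
  NumericT cg_vector_update_reference(std::vector<NumericT> & result, NumericT alpha,
                                      std::vector<NumericT> & p, std::vector<NumericT> & r,
                                      std::vector<NumericT> const & Ap, NumericT beta)
  {
    NumericT inner_prod_rr = 0;
    for (std::size_t i = 0; i < result.size(); ++i)
    {
      result[i] += alpha * p[i];
      r[i]      -= alpha * Ap[i];
      p[i]       = r[i] + beta * p[i];
      inner_prod_rr += r[i] * r[i];
    }
    return inner_prod_rr;
  }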
+
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "cg_csr_blocked_prod" : "cg_csr_prod");
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  if (use_nvidia_blocked)
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                             p,
+                             Ap,
+                             vec_size,
+                             inner_prod_buffer,
+                             buffer_size_per_vector,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                            ));
+  }
+  else
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                             p,
+                             Ap,
+                             vec_size,
+                             inner_prod_buffer,
+                             buffer_size_per_vector,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(1024 * sizeof(NumericT))
+                            ));
+  }
+
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+  Ap.clear();
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod");
+  unsigned int thread_num = 256; //k.local_work_size(0);
+
+  k.local_work_size(0, thread_num);
+
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           p,
+                           Ap,
+                           vec_size,
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          ));
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod");
+
+  unsigned int thread_num = 128;
+  unsigned int group_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.maxnnz()),
+                           cl_uint(A.internal_maxnnz()),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           vec_size,
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                         );
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod");
+
+  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+  unsigned int group_num = 256;
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+    thread_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+                           A.handle2().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           vec_size,
+                           cl_uint(A.rows_per_block()),
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                        );
+}
+
+
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod");
+
+  unsigned int thread_num = 128;
+  unsigned int group_num = 128;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle4().opencl_handle(),
+                           A.handle5().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.ell_nnz()),
+                           cl_uint(A.internal_ellnnz()),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           vec_size,
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                        );
+}
+
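All of the pipelined_cg_prod overloads above follow one pattern: a single fused kernel launch computes Ap = A*p for the respective sparse format and, in the same pass, writes first-stage partial sums for the inner products of the CG update into inner_prod_buffer (apparently three sub-buffers, given the division by 3). The following is a minimal host-side sketch of driving one overload; the include path for this header is assumed to match upstream ViennaCL and is not confirmed by this patch.

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/opencl/iterative_operations.hpp"  // assumed upstream path of this file

    template<typename NumericT>
    void fused_cg_matvec(viennacl::compressed_matrix<NumericT> const & A,
                         viennacl::vector_base<NumericT> const & p,
                         viennacl::vector_base<NumericT>       & Ap,
                         viennacl::vector_base<NumericT>       & inner_prod_buffer)
    {
      // One kernel launch: Ap = A * p plus the first reduction stage of the
      // inner products consumed later in the pipelined CG update.
      viennacl::linalg::opencl::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
    }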
+
+//////////////////////////// BiCGStab ////////////////////////
+
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+                                 vector_base<NumericT> & r,
+                                 vector_base<NumericT> const & Ap,
+                                 vector_base<NumericT> & inner_prod_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s");
+  cl_uint    vec_size = cl_uint(viennacl::traits::size(s));
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+  viennacl::ocl::enqueue(k(s, r, Ap,
+                           inner_prod_buffer, chunk_size, chunk_offset, vec_size,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+                                      NumericT beta, vector_base<NumericT> const & Ap,
+                                      vector_base<NumericT> const & r0star,
+                                      vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
+{
+  (void)buffer_chunk_size;
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update");
+  cl_uint    vec_size = cl_uint(viennacl::traits::size(result));
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
+                           residual, As,
+                           beta, Ap,
+                           r0star,
+                           inner_prod_buffer,
+                           vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                           )
+                         );
+}
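Judging from their argument lists, these two kernels fuse the standard BiCGStab vector updates with the first stage of the required inner-product reductions; alpha is presumably finished on the device from the partial sums selected by chunk_size/chunk_offset. In the usual notation (a reading of the arguments, not of the device code itself):

\[
s_j = r_j - \alpha_j A p_j, \qquad
x_{j+1} = x_j + \alpha_j p_j + \omega_j s_j, \qquad
r_{j+1} = s_j - \omega_j A s_j, \qquad
p_{j+1} = r_{j+1} + \beta_j \bigl(p_j - \omega_j A p_j\bigr),
\]

with the partial sums for quantities such as \(\langle r_{j+1}, r_0^{\ast}\rangle\) written into inner_prod_buffer for use in the next iteration.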
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod");
+
+  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  if (use_nvidia_blocked)
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                             p,
+                             Ap,
+                             r0star,
+                             vec_size,
+                             inner_prod_buffer, chunk_size, chunk_offset,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                            ));
+  }
+  else
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                             p,
+                             Ap,
+                             r0star,
+                             vec_size,
+                             inner_prod_buffer, chunk_size, chunk_offset,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                            ));
+  }
+
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+  Ap.clear();
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod");
+  unsigned int thread_num = 256; //k.local_work_size(0);
+
+  k.local_work_size(0, thread_num);
+
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           p,
+                           Ap,
+                           r0star,
+                           vec_size,
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
+                           inner_prod_buffer, chunk_size, chunk_offset,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          ));
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod");
+
+  unsigned int thread_num = 128;
+  unsigned int group_num = 128;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.maxnnz()),
+                           cl_uint(A.internal_maxnnz()),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           r0star,
+                           vec_size,
+                           inner_prod_buffer, chunk_size, chunk_offset,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                         );
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod");
+
+  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+  unsigned int group_num = 256;
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+    thread_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+                           A.handle2().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           r0star,
+                           vec_size,
+                           cl_uint(A.rows_per_block()),
+                           inner_prod_buffer, chunk_size, chunk_offset,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                        );
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod");
+
+  unsigned int thread_num = 256;
+  unsigned int group_num = 128;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*256);
+  }
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle4().opencl_handle(),
+                           A.handle5().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.ell_nnz()),
+                           cl_uint(A.internal_ellnnz()),
+                           viennacl::traits::opencl_handle(p),
+                           viennacl::traits::opencl_handle(Ap),
+                           r0star,
+                           vec_size,
+                           inner_prod_buffer, chunk_size, chunk_offset,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+                          )
+                        );
+}
+
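Compared with the CG overloads, the BiCGStab products additionally reduce against r0star and take an explicit chunk size and offset selecting where in inner_prod_buffer the partial sums land. A hedged sketch of a call site follows (same includes as the CG sketch above; the chunk layout chosen here is an assumption for illustration, the real layout is fixed by the BiCGStab driver that consumes the buffer):

    template<typename NumericT>
    void fused_bicgstab_matvec(viennacl::compressed_matrix<NumericT> const & A,
                               viennacl::vector_base<NumericT> const & p,
                               viennacl::vector_base<NumericT>       & Ap,
                               viennacl::vector_base<NumericT> const & r0star,
                               viennacl::vector_base<NumericT>       & inner_prod_buffer)
    {
      viennacl::vcl_size_t const chunk_size   = 256;             // partial sums per reduction (assumed)
      viennacl::vcl_size_t const chunk_offset = 3 * chunk_size;  // target chunk in the buffer (assumed)
      viennacl::linalg::opencl::pipelined_bicgstab_prod(A, p, Ap, r0star,
                                                        inner_prod_buffer,
                                                        chunk_size, chunk_offset);
    }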
+//////////////////////////// GMRES ////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+  *
+  * This routine computes, for vectors 'r' and 'v_k':
+  *   Second reduction step for ||v_k||
+  *   v_k /= ||v_k||
+  *   First reduction step for <r, v_k>
+  */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+                                  vector_base<T> const & residual,
+                                  vector_base<T> & R_buffer,
+                                  vcl_size_t offset_in_R,
+                                  vector_base<T> const & inner_prod_buffer,
+                                  vector_base<T> & r_dot_vk_buffer,
+                                  vcl_size_t buffer_chunk_size,
+                                  vcl_size_t buffer_chunk_offset)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  cl_uint size_vk      = cl_uint(v_k.size());
+  cl_uint vk_offset    = cl_uint(viennacl::traits::start(v_k));
+  cl_uint R_offset     = cl_uint(offset_in_R);
+  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
+  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+  viennacl::ocl::enqueue(k(v_k, vk_offset,
+                           residual,
+                           R_buffer, R_offset,
+                           inner_prod_buffer, chunk_size,
+                           r_dot_vk_buffer, chunk_offset,
+                           size_vk,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                           ));
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t param_k,
+                                         vector_base<T> & vi_in_vk_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  cl_uint size_vk          = cl_uint(v_k_size);
+  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+  cl_uint ocl_k            = cl_uint(param_k);
+  cl_uint chunk_size = cl_uint(buffer_chunk_size);
+  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
+                           vi_in_vk_buffer, chunk_size
+                           ));
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t param_k,
+                                         vector_base<T> const & vi_in_vk_buffer,
+                                         vector_base<T> & R_buffer,
+                                         vcl_size_t krylov_dim,
+                                         vector_base<T> & inner_prod_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  cl_uint size_vk          = cl_uint(v_k_size);
+  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+  cl_uint ocl_k            = cl_uint(param_k);
+  cl_uint chunk_size       = cl_uint(buffer_chunk_size);
+  cl_uint ocl_krylov_dim   = cl_uint(krylov_dim);
+  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
+                           vi_in_vk_buffer, chunk_size,
+                           R_buffer, ocl_krylov_dim,
+                           inner_prod_buffer,
+                           viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
+                           ));
+}
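Read together, the two stages form a split classical Gram-Schmidt orthogonalization of v_k against the previously stored Krylov vectors: stage 1 produces the partial sums of the projection coefficients in vi_in_vk_buffer, and stage 2 finishes them, records them in R_buffer, subtracts the projections, and starts the reduction for the new norm. In standard notation (inferred from the kernel arguments, not from the device code):

\[
r_{ik} = \langle v_i, v_k \rangle \quad (i < k), \qquad
\tilde v_k = v_k - \sum_{i<k} r_{ik}\, v_i, \qquad
\text{first reduction stage for } \|\tilde v_k\|^2 .
\]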
+
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+                                   vector_base<T> const & residual,
+                                   vector_base<T> const & krylov_basis,
+                                   vcl_size_t v_k_size,
+                                   vcl_size_t v_k_internal_size,
+                                   vector_base<T> const & coefficients,
+                                   vcl_size_t param_k)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result");
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  cl_uint size_vk          = cl_uint(v_k_size);
+  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+  cl_uint ocl_k            = cl_uint(param_k);
+  viennacl::ocl::enqueue(k(result,
+                           residual,
+                           krylov_basis, size_vk, internal_size_vk,
+                           coefficients, ocl_k
+                           ));
+}
+
+
+template <typename T>
+void pipelined_gmres_prod(compressed_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod");
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
+  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));
+
+  k.local_work_size(0, 128);
+  k.global_work_size(0, 128*128);
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+  {
+    k.local_work_size(0, 256);
+    k.global_work_size(0, 256*128);
+  }
+
+  if (use_nvidia_blocked)
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                             p, start_p,
+                             Ap, start_Ap,
+                             vec_size,
+                             inner_prod_buffer,
+                             buffer_size_per_vector,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                            ));
+  }
+  else
+  {
+    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+                             p, start_p,
+                             Ap, start_Ap,
+                             vec_size,
+                             inner_prod_buffer,
+                             buffer_size_per_vector,
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                             viennacl::ocl::local_mem(1024 * sizeof(T))
+                            ));
+  }
+}
+
+template <typename T>
+void pipelined_gmres_prod(coordinate_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
+  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));
+
+  Ap.clear();
+  inner_prod_buffer.clear();
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod");
+  unsigned int thread_num = 128; //k.local_work_size(0);
+
+  k.local_work_size(0, thread_num);
+
+  k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+                           p, start_p,
+                           Ap, start_Ap,
+                           vec_size,
+                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                           viennacl::ocl::local_mem(sizeof(T)*thread_num),
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                          ));
+}
+
+template <typename T>
+void pipelined_gmres_prod(ell_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
+  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod");
+
+  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
+  unsigned int group_num = 128;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.maxnnz()),
+                           cl_uint(A.internal_maxnnz()),
+                           viennacl::traits::opencl_handle(p), start_p,
+                           viennacl::traits::opencl_handle(Ap), start_Ap,
+                           vec_size,
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                          )
+                         );
+}
+
+template <typename T>
+void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
+  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod");
+
+  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+  unsigned int group_num = 128;
+
+  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+    thread_num = 256;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+                           A.handle2().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           viennacl::traits::opencl_handle(p), start_p,
+                           viennacl::traits::opencl_handle(Ap), start_Ap,
+                           vec_size,
+                           cl_uint(A.rows_per_block()),
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                          )
+                        );
+}
+
+
+template <typename T>
+void pipelined_gmres_prod(hyb_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
+  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
+  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));
+
+  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod");
+
+  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
+  unsigned int group_num = 128;
+
+  k.local_work_size(0, thread_num);
+  k.global_work_size(0, thread_num * group_num);
+
+
+  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+                           A.handle().opencl_handle(),
+                           A.handle3().opencl_handle(),
+                           A.handle4().opencl_handle(),
+                           A.handle5().opencl_handle(),
+                           cl_uint(A.internal_size1()),
+                           cl_uint(A.ell_nnz()),
+                           cl_uint(A.internal_ellnnz()),
+                           viennacl::traits::opencl_handle(p), start_p,
+                           viennacl::traits::opencl_handle(Ap), start_Ap,
+                           vec_size,
+                           inner_prod_buffer,
+                           buffer_size_per_vector,
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+                          )
+                        );
+}
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
new file mode 100644
index 0000000..b0252d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
@@ -0,0 +1,393 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/amg.hpp
+ *  @brief OpenCL kernel file for operations related to algebraic multigrid */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+
+template<typename StringT>
+void generate_amg_influence_trivial(StringT & source)
+{
+
+ source.append("__kernel void amg_influence_trivial( \n");
+ source.append("  __global const unsigned int * A_row_indices, \n");
+ source.append("  __global const unsigned int * A_col_indices, \n");
+ source.append("  unsigned int A_size1, \n");
+ source.append("  unsigned int A_nnz, \n");
+ source.append("  __global unsigned int * influences_row, \n");
+ source.append("  __global unsigned int * influences_id, \n");
+ source.append("  __global unsigned int * influences_values) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) \n");
+ source.append("  { \n");
+ source.append("    unsigned int tmp = A_row_indices[i]; \n");
+ source.append("    influences_row[i] = tmp; \n");
+ source.append("    influences_values[i] = A_row_indices[i+1] - tmp; \n");
+ source.append("  } \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) \n");
+ source.append("    influences_id[i] = A_col_indices[i]; \n");
+
+ source.append("  if (get_global_id(0) == 0) \n");
+ source.append("    influences_row[A_size1] = A_row_indices[A_size1]; \n");
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_amg_pmis2_init_workdata(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_init_workdata( \n");
+ source.append("  __global unsigned int       *work_state, \n");
+ source.append("  __global unsigned int       *work_random, \n");
+ source.append("  __global unsigned int       *work_index, \n");
+ source.append("  __global unsigned int const *point_types, \n");
+ source.append("  __global unsigned int const *random_weights, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append("    switch (point_types[i]) { \n");
+ source.append("    case 0:  work_state[i] = 1; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append("    case 1:  work_state[i] = 2; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append("    case 2:  work_state[i] = 0; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+
+ source.append("    default: break; // do nothing \n");
+ source.append("    } \n");
+
+ source.append("    work_random[i] = random_weights[i]; \n");
+ source.append("    work_index[i]  = i; \n");
+ source.append("  } \n");
+ source.append("} \n");
+}
+
+
+
+template<typename StringT>
+void generate_amg_pmis2_max_neighborhood(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_max_neighborhood( \n");
+ source.append("  __global unsigned int       *work_state, \n");
+ source.append("  __global unsigned int       *work_random, \n");
+ source.append("  __global unsigned int       *work_index, \n");
+ source.append("  __global unsigned int       *work_state2, \n");
+ source.append("  __global unsigned int       *work_random2, \n");
+ source.append("  __global unsigned int       *work_index2, \n");
+ source.append("  __global unsigned int const *influences_row, \n");
+ source.append("  __global unsigned int const *influences_id, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+
+ // load
+ source.append("    unsigned int state  = work_state[i]; \n");
+ source.append("    unsigned int random = work_random[i]; \n");
+ source.append("    unsigned int index  = work_index[i]; \n");
+
+ // max
+ source.append("    unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append("    for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append("      unsigned int influenced_point_id = influences_id[j]; \n");
+
+ // lexicographical triple-max (not particularly pretty, but does the job):
+ source.append("      if (state < work_state[influenced_point_id]) { \n");
+ source.append("        state  = work_state[influenced_point_id]; \n");
+ source.append("        random = work_random[influenced_point_id]; \n");
+ source.append("        index  = work_index[influenced_point_id]; \n");
+ source.append("      } else if (state == work_state[influenced_point_id]) { \n");
+ source.append("        if (random < work_random[influenced_point_id]) { \n");
+ source.append("          state  = work_state[influenced_point_id]; \n");
+ source.append("          random = work_random[influenced_point_id]; \n");
+ source.append("          index  = work_index[influenced_point_id]; \n");
+ source.append("        } else if (random == work_random[influenced_point_id]) { \n");
+ source.append("          if (index < work_index[influenced_point_id]) { \n");
+ source.append("            state  = work_state[influenced_point_id]; \n");
+ source.append("            random = work_random[influenced_point_id]; \n");
+ source.append("            index  = work_index[influenced_point_id]; \n");
+ source.append("          } \n");
+ source.append("        } \n");
+ source.append("      } \n");
+
+ source.append("    }\n"); //for
+
+ // store
+ source.append("    work_state2[i]  = state; \n");
+ source.append("    work_random2[i] = random; \n");
+ source.append("    work_index2[i]  = index; \n");
+ source.append("  } \n");
+ source.append("} \n");
+}
+
+
+
+template<typename StringT>
+void generate_amg_pmis2_mark_mis_nodes(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_mark_mis_nodes( \n");
+ source.append("  __global unsigned int const *work_state, \n");
+ source.append("  __global unsigned int const *work_index, \n");
+ source.append("  __global unsigned int       *point_types, \n");
+ source.append("  __global unsigned int       *undecided_buffer, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  unsigned int num_undecided = 0; \n");
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append("    unsigned int max_state  = work_state[i]; \n");
+ source.append("    unsigned int max_index  = work_index[i]; \n");
+
+ source.append("    if (point_types[i] == 0) { \n");                     // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append("      if      (i == max_index) point_types[i] = 1; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append("      else if (max_state == 2) point_types[i] = 2; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+ source.append("      else                     num_undecided += 1; \n");
+ source.append("    } \n");
+ source.append("  } \n");
+
+ // reduction in shared memory:
+ source.append("  __local unsigned int shared_buffer[256]; \n");
+ source.append("  shared_buffer[get_local_id(0)] = num_undecided; \n");
+ source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append("    if (get_local_id(0) < stride) shared_buffer[get_local_id(0)] += shared_buffer[get_local_id(0)+stride]; \n");
+ source.append("  } \n");
+
+ source.append("  if (get_local_id(0) == 0) \n");
+ source.append("    undecided_buffer[get_group_id(0)] = shared_buffer[0]; \n");
+
+ source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_amg_pmis2_reset_state(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_reset_state( \n");
+ source.append("  __global unsigned int *point_types, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append("    if (point_types[i] != 1) point_types[i] = 0;\n"); // mind mapping of POINT_TYPE_COARSE and POINT_TYPE_UNDECIDED
+ source.append("  } \n");
+
+ source.append("} \n");
+}
+
+
+
+//////////////
+
+
+
+template<typename StringT>
+void generate_amg_agg_propagate_coarse_indices(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_propagate_coarse_indices( \n");
+ source.append("  __global unsigned int       *point_types, \n");
+ source.append("  __global unsigned int       *coarse_ids, \n");
+ source.append("  __global unsigned int const *influences_row, \n");
+ source.append("  __global unsigned int const *influences_id, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("  { \n");
+ source.append("    if (point_types[i] == 1) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append("      unsigned int coarse_index = coarse_ids[i]; \n");
+
+ source.append("      unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append("      for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append("        unsigned int influenced_point_id = influences_id[j]; \n");
+ source.append("        coarse_ids[influenced_point_id] = coarse_index; \n");
+ source.append("        if (influenced_point_id != i) point_types[influenced_point_id] = 2; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+ source.append("      } \n");
+ source.append("    } \n");
+ source.append("  } \n");
+ source.append("} \n");
+
+}
+
+
+
+template<typename StringT>
+void generate_amg_agg_merge_undecided(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_merge_undecided( \n");
+ source.append("  __global unsigned int       *point_types, \n");
+ source.append("  __global unsigned int       *coarse_ids, \n");
+ source.append("  __global unsigned int const *influences_row, \n");
+ source.append("  __global unsigned int const *influences_id, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("  { \n");
+ source.append("    if (point_types[i] == 0) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+
+ source.append("      unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append("      for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append("        unsigned int influenced_point_id = influences_id[j]; \n");
+ source.append("        if (point_types[influenced_point_id] != 0) { \n");       // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append("          coarse_ids[i] = coarse_ids[influenced_point_id]; \n");
+ source.append("          break; \n");
+ source.append("        } \n");
+ source.append("      } \n");
+
+ source.append("    } \n");
+ source.append("  } \n");
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_amg_agg_merge_undecided_2(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_merge_undecided_2( \n");
+ source.append("  __global unsigned int *point_types, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("    if (point_types[i] == 0) point_types[i] = 2; \n"); // POINT_TYPE_UNDECIDED to POINT_TYPE_FINE
+
+ source.append("} \n");
+}
+
+//////////////////////
+
+template<typename StringT>
+void generate_amg_interpol_ag(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void amg_interpol_ag( \n");
+ source.append("  __global unsigned int * P_row_indices, \n");
+ source.append("  __global unsigned int * P_column_indices, \n");
+ source.append("  __global "); source.append(numeric_string); source.append(" * P_elements, \n");
+ source.append("  __global const unsigned int * coarse_agg_ids, \n");
+ source.append("  unsigned int size) { \n");
+
+ source.append("   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("   { \n");
+ source.append("     P_row_indices[i] = i; \n");
+ source.append("     P_column_indices[i] = coarse_agg_ids[i]; \n");
+ source.append("     P_elements[i] = 1; \n");
+ source.append("   } \n");
+ source.append("   if (get_global_id(0) == 0) P_row_indices[size] = size; \n");
+ source.append("  } \n");
+
+}
+
+template<typename StringT>
+void generate_amg_interpol_sa(StringT & source, std::string const & numeric_string)
+{
+
+ source.append("__kernel void amg_interpol_sa( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" unsigned int A_nnz, \n");
+ source.append(" __global unsigned int *Jacobi_row_indices, \n");
+ source.append(" __global unsigned int *Jacobi_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *Jacobi_elements, \n");
+ source.append(" "); source.append(numeric_string); source.append(" omega) { \n");
+
+ source.append("  for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) \n");
+ source.append("  { \n");
+ source.append("    unsigned int row_begin = A_row_indices[row]; \n");
+ source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");
+
+ source.append("    Jacobi_row_indices[row] = row_begin; \n");
+
+ // Step 1: Extract diagonal:
+ source.append("    "); source.append(numeric_string); source.append(" diag = 0; \n");
+ source.append("    for (unsigned int j = row_begin; j < row_end; ++j) { \n");
+ source.append("      if (A_col_indices[j] == row) { \n");
+ source.append("        diag = A_elements[j]; \n");
+ source.append("        break; \n");
+ source.append("      } \n");
+ source.append("    } \n");
+
+ // Step 2: Write entries:
+ source.append("    for (unsigned int j = row_begin; j < row_end; ++j) { \n");
+ source.append("      unsigned int col_index = A_col_indices[j]; \n");
+ source.append("      Jacobi_col_indices[j] = col_index; \n");
+ source.append("      Jacobi_elements[j] = (col_index == row) ? (1 - omega) : (-omega * A_elements[j] / diag); \n");
+ source.append("    } \n");
+
+ source.append("  } \n");
+ source.append("  if (get_global_id(0) == 0) Jacobi_row_indices[A_size1] = A_nnz; \n");
+ source.append("} \n");
+
+}
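The entries written in Step 2 are exactly those of a damped Jacobi smoother: with D the diagonal of A,

\[
S \;=\; I - \omega\, D^{-1} A, \qquad
S_{rc} \;=\;
\begin{cases}
1 - \omega, & c = r,\\
-\,\omega\, a_{rc} / a_{rr}, & c \neq r,
\end{cases}
\]

which matches Jacobi_elements[j] above entry by entry. Smoothed-aggregation AMG then, as usual, applies this operator to the tentative prolongator (P = S * P_tent); that multiplication happens elsewhere and is mentioned here only for context.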
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for algebraic multigrid (AMG). */
+template<typename NumericT>
+struct amg
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_amg";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(2048);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_amg_influence_trivial(source);
+      generate_amg_pmis2_init_workdata(source);
+      generate_amg_pmis2_max_neighborhood(source);
+      generate_amg_pmis2_mark_mis_nodes(source);
+      generate_amg_pmis2_reset_state(source);
+      generate_amg_agg_propagate_coarse_indices(source);
+      generate_amg_agg_merge_undecided(source);
+      generate_amg_agg_merge_undecided_2(source);
+
+      generate_amg_interpol_ag(source, numeric_string);
+      generate_amg_interpol_sa(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
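Following the same pattern used for the iterative kernels earlier in this commit (initialize the kernel class once per context, then fetch kernels by name), a caller would use this class roughly as sketched below; only the kernel name and the program-name convention are taken from the code above, the rest is illustrative.

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/amg.hpp"

    template<typename NumericT>
    viennacl::ocl::kernel & get_amg_influence_kernel(viennacl::ocl::context & ctx)
    {
      typedef viennacl::linalg::opencl::kernels::amg<NumericT> KernelClass;
      KernelClass::init(ctx);  // compiles and registers the program on first use
      return ctx.get_kernel(KernelClass::program_name(), "amg_influence_trivial");
    }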


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
new file mode 100644
index 0000000..a7319d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
@@ -0,0 +1,420 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 += alpha->value_float * v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 += alpha->value_double * v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x, ViennaCLVector     y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+  if (c->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (s->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      v1 *= alpha->value_float;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      v1 *= alpha->value_double;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
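The generic entry points in blas1.cpp all follow one pattern: verify that every operand carries the same precision tag, obtain a viennacl::backend::mem_handle through init_vector(), wrap it in a viennacl::vector_base<T> with the caller-supplied size, offset and stride, and forward to the matching ViennaCL routine (norm_1, norm_2, inner_prod, plane_rotation, ...). The following minimal sketch is not part of the commit; it shows the same wrapping applied directly to an ordinary host buffer, using only ViennaCL headers already included above and the vector_base constructor that the host backend (blas1_host.cpp, further below) also uses.

// Minimal sketch (not part of the commit): wrapping an existing host buffer in
// viennacl::vector_base<float> and calling the same ViennaCL operations that
// the generic entry points above dispatch to.
#include <iostream>
#include <vector>

#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

int main()
{
  typedef viennacl::vector_base<float>::size_type   size_type;
  typedef viennacl::vector_base<float>::size_type   difference_type;

  std::vector<float> x(16, 1.0f), y(16, 2.0f);

  // (pointer, memory domain, length, offset, stride) -- the same constructor
  // used by the host backend in blas1_host.cpp below.
  viennacl::vector_base<float> v1(&x[0], viennacl::MAIN_MEMORY, size_type(x.size()), size_type(0), difference_type(1));
  viennacl::vector_base<float> v2(&y[0], viennacl::MAIN_MEMORY, size_type(y.size()), size_type(0), difference_type(1));

  v2 += 0.5f * v1;                                       // xAXPY
  float dot  = viennacl::linalg::inner_prod(v1, v2);     // xDOT
  float nrm2 = viennacl::linalg::norm_2(v1);             // xNRM2

  std::cout << "dot = " << dot << ", norm_2 = " << nrm2 << std::endl;
  return 0;
}

Apart from the element type, the double-precision branches behave identically.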

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
new file mode 100644
index 0000000..a7319d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
@@ -0,0 +1,420 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 += alpha->value_float * v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 += alpha->value_double * v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x, ViennaCLVector     y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+  if (c->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (s->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      v1 *= alpha->value_float;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+      v1 *= alpha->value_double;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      typedef viennacl::vector_base<float>::size_type     difference_type;
+      viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      typedef viennacl::vector_base<double>::size_type     difference_type;
+      viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+      viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
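blas1.cu is a byte-for-byte copy of blas1.cpp (both diffs reference blob index a7319d5), so the sketch shown after blas1.cpp applies to it unchanged; the .cu copy presumably exists so the same translation unit can be handed to nvcc when the CUDA backend is enabled.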

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
new file mode 100644
index 0000000..e6dddbb
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
@@ -0,0 +1,264 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
+
+
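The CUDA entry points in blas1_cuda.cu wrap raw device pointers in viennacl::vector_base<T> tagged with viennacl::CUDA_MEMORY, so no host staging is involved, and the whole file is compiled only when VIENNACL_WITH_CUDA is defined. A hedged usage sketch follows; it is not part of the commit and assumes that the libviennacl header "viennacl.hpp" (included by the file itself) provides the declarations and that passing NULL for the ViennaCLBackend argument is acceptable, since the parameter is ignored by the implementations above.

// Hedged usage sketch (not part of the commit).  Assumes VIENNACL_WITH_CUDA,
// the libviennacl header "viennacl.hpp" for the declarations, and that NULL is
// an acceptable value for the ignored ViennaCLBackend argument.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#include "viennacl.hpp"   // ViennaCLCUDASaxpy, ViennaCLCUDASdot, ...

int main()
{
  const ViennaCLInt n = 16;
  std::vector<float> hx(n, 1.0f), hy(n, 2.0f);

  float *dx = 0, *dy = 0;
  cudaMalloc((void**)&dx, n * sizeof(float));
  cudaMalloc((void**)&dy, n * sizeof(float));
  cudaMemcpy(dx, &hx[0], n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dy, &hy[0], n * sizeof(float), cudaMemcpyHostToDevice);

  // y <- 0.5 * x + y on the device, then the dot product of x with the result.
  ViennaCLCUDASaxpy(NULL, n, 0.5f, dx, 0, 1, dy, 0, 1);

  float dot = 0.0f;
  ViennaCLCUDASdot(NULL, n, &dot, dx, 0, 1, dy, 0, 1);
  std::printf("dot = %f\n", dot);

  cudaFree(dx);
  cudaFree(dy);
  return 0;
}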

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
new file mode 100644
index 0000000..07a5097
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
@@ -0,0 +1,293 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float c, float s)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double c, double s)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
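blas1_host.cpp provides the same set of routines for the CPU backend: each call wraps the caller's pointer in a viennacl::vector_base<T> tagged viennacl::MAIN_MEMORY with the given length, offset and stride, then runs the corresponding ViennaCL operation in main memory. A hedged usage sketch, not part of the commit, follows; as before it assumes "viennacl.hpp" from libviennacl for the declarations and that NULL is acceptable for the ignored ViennaCLBackend argument.

// Hedged usage sketch (not part of the commit): calling the host entry points
// on ordinary CPU arrays, using the offset/stride arguments to address every
// second element.  Assumes "viennacl.hpp" declares these functions and that
// NULL is acceptable for the ignored ViennaCLBackend argument.
#include <cstdio>
#include <vector>

#include "viennacl.hpp"   // ViennaCLHostSaxpy, ViennaCLHostSnrm2, ...

int main()
{
  std::vector<float> x(8, 1.0f), y(8, 2.0f);

  // Treat x and y as length-4 vectors with offset 0 and stride 2:
  // y[0,2,4,6] += 2 * x[0,2,4,6]
  ViennaCLHostSaxpy(NULL, 4, 2.0f, &x[0], 0, 2, &y[0], 0, 2);

  float nrm = 0.0f;
  ViennaCLHostSnrm2(NULL, 4, &nrm, &y[0], 0, 2);   // sqrt(4 * 4^2) = 8 here

  std::printf("norm_2 over the strided elements of y = %f\n", nrm);
  return 0;
}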

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
new file mode 100644
index 0000000..07a5097
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
@@ -0,0 +1,293 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float c, float s)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double c, double s)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
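For reference, a minimal caller-side sketch of the host BLAS-1 entry points defined above, operating on plain C arrays. This is an illustration only, not part of the commit: the prototypes are assumed to come from the "viennacl.hpp" header included at the top of this file, and since the host variants ignore their ViennaCLBackend argument (note the /*backend*/ parameters), passing NULL for the backend is assumed to be acceptable.

    #include <cstdio>
    #include "viennacl.hpp"   // assumed to declare the ViennaCLHost* prototypes

    int main()
    {
      double x[] = {1.0, 2.0, 3.0, 4.0};
      double y[] = {10.0, 20.0, 30.0, 40.0};

      // y := 2*x + y over all four entries (offset 0, unit stride);
      // the host path ignores the backend argument, so NULL is assumed to be fine.
      ViennaCLHostDaxpy(NULL, 4, 2.0, x, 0, 1, y, 0, 1);

      // dot = <x, y> = 1*12 + 2*24 + 3*36 + 4*48 = 360
      double dot = 0.0;
      ViennaCLHostDdot(NULL, 4, &dot, x, 0, 1, y, 0, 1);

      std::printf("dot = %g\n", dot);
      return 0;
    }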

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
new file mode 100644
index 0000000..617b128
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
@@ -0,0 +1,297 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<float>::size_type           size_type;
+  typedef viennacl::vector_base<float>::size_type           difference_type;
+  viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  typedef viennacl::vector_base<double>::size_type           size_type;
+  typedef viennacl::vector_base<double>::size_type           difference_type;
+  viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
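The OpenCL entry points above differ from the host ones only in taking cl_mem handles plus a backend whose opencl_backend.context_id selects the viennacl::ocl context; the actual work is delegated to the same generic ViennaCL routines in both cases. A minimal sketch of those underlying C++ calls on viennacl::vector<float> (an illustration under stated assumptions, not part of the commit):

    #include <iostream>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"
    #include "viennacl/linalg/norm_2.hpp"

    int main()
    {
      // With VIENNACL_WITH_OPENCL defined these vectors live in the default
      // OpenCL context; otherwise ViennaCL falls back to the host backend.
      viennacl::vector<float> v1 = viennacl::scalar_vector<float>(8, 1.0f);
      viennacl::vector<float> v2 = viennacl::scalar_vector<float>(8, 2.0f);

      v2 += 0.5f * v1;                                    // the *axpy path
      float dot  = viennacl::linalg::inner_prod(v1, v2);  // the *dot path
      float nrm2 = viennacl::linalg::norm_2(v2);          // the *nrm2 path

      std::cout << "dot = " << dot << ", nrm2 = " << nrm2 << std::endl;
      return 0;
    }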


[06/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
new file mode 100644
index 0000000..dcd39ad
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
@@ -0,0 +1,283 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/scalar.hpp
+ *  @brief OpenCL kernel file for scalar operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in avbv-like operations */
+enum asbs_scalar_type
+{
+  VIENNACL_ASBS_NONE = 0, // scalar does not exist/contribute
+  VIENNACL_ASBS_CPU,
+  VIENNACL_ASBS_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of viennacl::scalar<> objects */
+struct asbs_config
+{
+  asbs_config() : with_stride_and_range(true), a(VIENNACL_ASBS_CPU), b(VIENNACL_ASBS_NONE) {}
+
+  bool with_stride_and_range;
+  std::string      assign_op;
+  asbs_scalar_type a;
+  asbs_scalar_type b;
+};
+
+// just returns the assignment string
+template<typename StringT>
+void generate_asbs_impl3(StringT & source, char sign_a, char sign_b, asbs_config const & cfg, bool mult_alpha, bool mult_beta)
+{
+  source.append("      *s1 "); source.append(cfg.assign_op); source.append(1, sign_a); source.append(" *s2 ");
+  if (mult_alpha)
+    source.append("* alpha ");
+  else
+    source.append("/ alpha ");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append(1, sign_b); source.append(" *s3 ");
+    if (mult_beta)
+      source.append("* beta");
+    else
+      source.append("/ beta");
+  }
+  source.append("; \n");
+}
+
+template<typename StringT>
+void generate_asbs_impl2(StringT & source, char sign_a, char sign_b, asbs_config const & cfg)
+{
+  source.append("    if (options2 & (1 << 1)) { \n");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append("     if (options3 & (1 << 1)) \n");
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, false);
+    source.append("     else \n");
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+  }
+  else
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+  source.append("    } else { \n");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append("     if (options3 & (1 << 1)) \n");
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, false);
+    source.append("     else \n");
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+  }
+  else
+    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+  source.append("    } \n");
+
+}
+
+template<typename StringT>
+void generate_asbs_impl(StringT & source, std::string const & numeric_string, asbs_config const & cfg)
+{
+  source.append("__kernel void as");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+    source.append("bs");
+  if (cfg.assign_op != "=")
+    source.append("_s");
+
+  if (cfg.a == VIENNACL_ASBS_CPU)
+    source.append("_cpu");
+  else if (cfg.a == VIENNACL_ASBS_GPU)
+    source.append("_gpu");
+
+  if (cfg.b == VIENNACL_ASBS_CPU)
+    source.append("_cpu");
+  else if (cfg.b == VIENNACL_ASBS_GPU)
+    source.append("_gpu");
+  source.append("( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * s1, \n");
+  source.append(" \n");
+  if (cfg.a == VIENNACL_ASBS_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+  }
+  else if (cfg.a == VIENNACL_ASBS_GPU)
+  {
+    source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+  }
+  source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+  source.append("  __global const "); source.append(numeric_string); source.append(" * s2");
+
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append(", \n\n");
+    if (cfg.b == VIENNACL_ASBS_CPU)
+    {
+      source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+    }
+    else if (cfg.b == VIENNACL_ASBS_GPU)
+    {
+      source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+    }
+    source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+    source.append("  __global const "); source.append(numeric_string); source.append(" * s3");
+  }
+  source.append(") \n{ \n");
+
+  if (cfg.a == VIENNACL_ASBS_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+  }
+  else if (cfg.a == VIENNACL_ASBS_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+  }
+  source.append(" \n");
+
+  if (cfg.b == VIENNACL_ASBS_CPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+  }
+  else if (cfg.b == VIENNACL_ASBS_GPU)
+  {
+    source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+  }
+
+  source.append("  if (options2 & (1 << 0)) { \n");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append("   if (options3 & (1 << 0)) { \n");
+    generate_asbs_impl2(source, '-', '-', cfg);
+    source.append("   } else { \n");
+    generate_asbs_impl2(source, '-', '+', cfg);
+    source.append("   } \n");
+  }
+  else
+    generate_asbs_impl2(source, '-', '+', cfg);
+  source.append("  } else { \n");
+  if (cfg.b != VIENNACL_ASBS_NONE)
+  {
+    source.append("   if (options3 & (1 << 0)) { \n");
+    generate_asbs_impl2(source, '+', '-', cfg);
+    source.append("   } else { \n");
+    generate_asbs_impl2(source, '+', '+', cfg);
+    source.append("   } \n");
+  }
+  else
+    generate_asbs_impl2(source, '+', '+', cfg);
+
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_asbs(StringT & source, std::string const & numeric_string)
+{
+  asbs_config cfg;
+  cfg.assign_op = "=";
+  cfg.with_stride_and_range = true;
+
+  // as
+  cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+  // asbs
+  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+  // asbs_s (in-place variants using '+=')
+  cfg.assign_op = "+=";
+
+  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+}
+
+template<typename StringT>
+void generate_scalar_swap(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void swap( \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * s1, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * s2) \n");
+  source.append("{ \n");
+  source.append("  "); source.append(numeric_string); source.append(" tmp = *s2; \n");
+  source.append("  *s2 = *s1; \n");
+  source.append("  *s1 = tmp; \n");
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations involving viennacl::scalar<>, but not viennacl::vector<> or viennacl::matrix<>. */
+template<typename NumericT>
+struct scalar
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_scalar";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(8192);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parametrized kernels:
+      generate_asbs(source, numeric_string);
+      generate_scalar_swap(source, numeric_string);
+
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
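As a usage note (an assumption based on the surrounding ViennaCL sources, not something this commit adds): the program generated here is registered under program_name() (e.g. "float_scalar"), and the kernels follow the naming scheme assembled in generate_asbs_impl ("as_cpu", "as_gpu", "asbs_cpu_gpu", "asbs_s_cpu_cpu" for the '+=' variants, and so on), plus the "swap" kernel from generate_scalar_swap. A typical lookup after init() might look like:

    #include "viennacl/ocl/backend.hpp"                     // viennacl::ocl::current_context()
    #include "viennacl/linalg/opencl/kernels/scalar.hpp"

    void fetch_scalar_swap_kernel()
    {
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();
      viennacl::linalg::opencl::kernels::scalar<float>::init(ctx);   // compiles "float_scalar" once
      viennacl::ocl::kernel & k =
        ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<float>::program_name(), "swap");
      (void)k;  // the scalar operations would now enqueue k with the two scalar handles as arguments
    }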

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
new file mode 100644
index 0000000..9626d2d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
@@ -0,0 +1,194 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/scan.hpp
+ *  @brief OpenCL kernel file for scan operations. To be merged back to vector operations. */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+
+template <typename StringType>
+void generate_scan_kernel_1(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void scan_1(__global "); source.append(numeric_string); source.append("* X, \n");
+  source.append("                     unsigned int startX, \n");
+  source.append("                     unsigned int incX, \n");
+  source.append("                     unsigned int sizeX, \n");
+
+  source.append("                     __global "); source.append(numeric_string); source.append("* Y, \n");
+  source.append("                     unsigned int startY, \n");
+  source.append("                     unsigned int incY, \n");
+
+  source.append("                     unsigned int scan_offset, \n"); // 0 for inclusive scan, 1 for exclusive scan
+  source.append("                     __global "); source.append(numeric_string); source.append("* carries) { \n");
+
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" my_value; \n");
+
+  source.append("  unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
+  source.append("  unsigned int block_start = work_per_thread * get_local_size(0) *  get_group_id(0); \n");
+  source.append("  unsigned int block_stop  = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
+  source.append("  unsigned int block_offset = 0; \n");
+
+  // run scan on each section:
+  source.append("  for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");
+
+  // load data
+  source.append("    my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");
+
+  // inclusive scan in shared buffer:
+  source.append("    for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+  source.append("       barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("       shared_buffer[get_local_id(0)] = my_value;   \n");
+  source.append("       barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("       if (get_local_id(0) >= stride)   \n");
+  source.append("         my_value += shared_buffer[get_local_id(0) - stride];   \n");
+  source.append("    } \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("    shared_buffer[get_local_id(0)] = my_value;   \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE);   \n");
+
+  // write to output array:
+  source.append("    if (scan_offset > 0) \n");
+  source.append("      my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
+
+  source.append("    if (i < sizeX) \n");
+  source.append("      Y[i * incY + startY] = block_offset + my_value; \n");
+
+  source.append("    block_offset += shared_buffer[get_local_size(0)-1]; \n");
+  source.append("  } \n");
+
+  // write carry:
+  source.append("  if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");
+
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_scan_kernel_2(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void scan_2(__global "); source.append(numeric_string); source.append("* carries) { \n");
+
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n");       //section size
+
+  // load data
+  source.append("  "); source.append(numeric_string); source.append(" my_carry = carries[get_local_id(0)]; \n");
+
+  // scan in shared buffer:
+  source.append("  for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+  source.append("     barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("     shared_buffer[get_local_id(0)] = my_carry;   \n");
+  source.append("     barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("     if (get_local_id(0) >= stride)   \n");
+  source.append("       my_carry += shared_buffer[get_local_id(0) - stride];   \n");
+  source.append("  } \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE);   \n");
+  source.append("  shared_buffer[get_local_id(0)] = my_carry;   \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE);   \n");
+
+  // write to output array:
+  source.append("  carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0;  \n");
+
+  source.append("} \n");
+}
+
+template <typename StringType>
+void generate_scan_kernel_3(StringType & source, std::string const & numeric_string)
+{
+  source.append("__kernel void scan_3(__global "); source.append(numeric_string); source.append(" * Y, \n");
+  source.append("                     unsigned int startY, \n");
+  source.append("                     unsigned int incY, \n");
+  source.append("                     unsigned int sizeY, \n");
+
+  source.append("                     __global "); source.append(numeric_string); source.append("* carries) { \n");
+
+  source.append("  unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
+  source.append("  unsigned int block_start = work_per_thread * get_local_size(0) *  get_group_id(0); \n");
+  source.append("  unsigned int block_stop  = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
+
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_offset; \n");
+
+  source.append("  if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
+  source.append("  barrier(CLK_LOCAL_MEM_FENCE);   \n");
+
+  source.append("  for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
+  source.append("    if (i < sizeY) \n");
+  source.append("      Y[i * incY + startY] += shared_offset; \n");
+
+  source.append("} \n");
+}
+
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for inclusive and exclusive scan operations on vectors. */
+template<typename NumericT>
+struct scan
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_scan";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_scan_kernel_1(source, numeric_string);
+      generate_scan_kernel_2(source, numeric_string);
+      generate_scan_kernel_3(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
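The three kernels above implement a classic three-pass scan: scan_1 scans each block independently and writes one carry per block, scan_2 turns the carries into exclusive prefix offsets, and scan_3 adds each block's offset back onto its entries. A plain CPU sketch of the same scheme (an illustration only, not part of the commit):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    std::vector<int> inclusive_scan_3pass(std::vector<int> const & x, std::size_t block)
    {
      std::vector<int> y(x.size());
      std::vector<int> carries((x.size() + block - 1) / block, 0);

      // pass 1 (scan_1): scan each block on its own, remember the block sum ("carry")
      for (std::size_t b = 0; b * block < x.size(); ++b)
      {
        int running = 0;
        for (std::size_t i = b * block; i < std::min(x.size(), (b + 1) * block); ++i)
          y[i] = (running += x[i]);
        carries[b] = running;
      }

      // pass 2 (scan_2): exclusive scan over the carries
      int offset = 0;
      for (std::size_t b = 0; b < carries.size(); ++b)
      {
        int tmp = carries[b];
        carries[b] = offset;
        offset += tmp;
      }

      // pass 3 (scan_3): add the offset of all preceding blocks onto each entry
      for (std::size_t b = 0; b * block < x.size(); ++b)
        for (std::size_t i = b * block; i < std::min(x.size(), (b + 1) * block); ++i)
          y[i] += carries[b];

      return y;
    }

For example, with block = 2 the input {1, 2, 3, 4, 5} yields {1, 3, 6, 10, 15}, matching a single-pass inclusive scan.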

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
new file mode 100644
index 0000000..562cb52
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
@@ -0,0 +1,135 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
+ *  @brief OpenCL kernel file for sliced_ell_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_sliced_ell_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul( \n");
+  source.append("  __global const unsigned int * columns_per_block, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * block_start, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" beta, \n"); }
+  source.append("  unsigned int block_size) \n");
+  source.append("{ \n");
+  source.append("  uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+  source.append("  uint id_in_block = get_local_id(0) % block_size; \n");
+  source.append("  uint num_blocks  = (layout_result.z - 1) / block_size + 1; \n");
+  source.append("  uint global_warp_count  = blocks_per_workgroup * get_num_groups(0); \n");
+  source.append("  uint global_warp_id     = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+  source.append("  for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+  source.append("    uint row    = block_idx * block_size + id_in_block; \n");
+  source.append("    uint offset = block_start[block_idx]; \n");
+  source.append("    uint num_columns = columns_per_block[block_idx]; \n");
+  source.append("    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+  source.append("      uint index = offset + item_id * block_size + id_in_block; \n");
+  source.append("      "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+  source.append("      sum += (val != 0) ? (x[column_indices[index] * layout_x.y + layout_x.x] * val) : 0; \n");
+  source.append("    } \n");
+
+  source.append("    if (row < layout_result.z) \n");
+  if (with_alpha_beta)
+    source.append("      result[row * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row * layout_result.y + layout_result.x] = sum; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
+
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for sliced_ell_matrix. */
+template<typename NumericT, typename IndexT>
+struct sliced_ell_matrix;
+
+template<typename NumericT>
+struct sliced_ell_matrix<NumericT, unsigned int>
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + viennacl::ocl::type_to_string<unsigned int>::apply() + "_sliced_ell_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      // fully parametrized kernels:
+      generate_sliced_ell_vec_mul(source, numeric_string, true);
+      generate_sliced_ell_vec_mul(source, numeric_string, false);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
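To make the indexing in vec_mul above concrete, here is a tiny sliced-ELL layout (an illustrative assumption, not test data from the commit) for a 4x4 matrix with block_size = 2. Within block b, entry item_id of row id_in_block sits at elements[block_start[b] + item_id * block_size + id_in_block], and zero-valued padding entries are skipped by the val != 0 test:

    // A = [ 10   0  20   0 ]      block 0 = rows 0..1, block 1 = rows 2..3
    //     [  0  30   0   0 ]
    //     [  0   0  40  50 ]
    //     [ 60   0   0  70 ]
    const unsigned int block_size           = 2;
    const unsigned int columns_per_block[2] = { 2, 2 };            // padded width of each block
    const unsigned int block_start[2]       = { 0, 4 };            // offset of each block in 'elements'
    const double       elements[8]          = { 10, 30, 20,  0,    // block 0: item 0, then item 1
                                                40, 60, 50, 70 };  // block 1: item 0, then item 1
    const unsigned int column_indices[8]    = {  0,  1,  2,  0,    // the padding slot's column is arbitrary
                                                 2,  0,  3,  3 };

Multiplying with x = (1, 1, 1, 1) under this layout gives result = (30, 30, 90, 130), i.e. the ordinary row sums of A.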

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
new file mode 100644
index 0000000..19ac991
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
@@ -0,0 +1,631 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/spai.hpp
+ *  @brief OpenCL kernel file for sparse approximate inverse operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_spai_assemble_blocks(StringT & source, std::string const & numeric_string)
+{
+  source.append("float get_element(__global const unsigned int * row_indices, \n");
+  source.append("           __global const unsigned int * column_indices, \n");
+  source.append("           __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("           unsigned int row, \n");
+  source.append("           unsigned int col) \n");
+  source.append("{ \n");
+  source.append("  unsigned int row_end = row_indices[row+1]; \n");
+  source.append("  for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
+  source.append("    if (column_indices[i] == col) \n");
+  source.append("      return elements[i]; \n");
+  source.append("    if (column_indices[i] > col) \n");
+  source.append("      return 0; \n");
+  source.append("  } \n");
+  source.append("  return 0; \n");
+  source.append("} \n");
+
+  source.append("void block_assembly(__global const unsigned int * row_indices, \n");
+  source.append("          __global const unsigned int * column_indices, \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("          __global const unsigned int * matrix_dimensions, \n");
+  source.append("          __global const unsigned int * set_I, \n");
+  source.append("          __global const unsigned int * set_J, \n");
+  source.append("          unsigned int matrix_ind, \n");
+  source.append("          __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
+  source.append("{ \n");
+  source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+  source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+
+  source.append("  for (unsigned int i = 0; i < col_n; ++i){ \n");
+  //start row index
+  source.append("    for (unsigned int j = 0; j < row_n; j++){ \n");
+  source.append("      com_A_I_J[i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("__kernel void assemble_blocks( \n");
+  source.append("          __global const unsigned int * row_indices, \n");
+  source.append("          __global const unsigned int * column_indices, \n");
+  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("          __global const unsigned int * set_I, \n");
+  source.append("        __global const unsigned int * set_J, \n");
+  source.append("      __global const unsigned int * i_ind, \n");
+  source.append("      __global const unsigned int * j_ind, \n");
+  source.append("        __global const unsigned int * block_ind, \n");
+  source.append("        __global const unsigned int * matrix_dimensions, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
+  source.append("      __global unsigned int * g_is_update, \n");
+  source.append("                   unsigned int  block_elems_num) \n");
+  source.append("{ \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("            block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("  } \n");
+}
+
+template<typename StringT>
+void generate_spai_block_bv_assembly(StringT & source, std::string const & numeric_string)
+{
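+  // Generated kernel "block_bv_assembly": per block, concatenates the segment of g_bv (length
+  // matrix_dimensions[2*i + 1]) with the corresponding update segment of g_bv_u into g_bv_r.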
+  source.append("  void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
+  source.append("    for (unsigned int i = 0; i < col_n; ++i){ \n");
+  source.append("      g_bv_r[i] = g_bv[ i]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  source.append("  void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
+  source.append("               __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
+  source.append("  { \n");
+  source.append("    assemble_bv(g_bv_r, g_bv, col_n); \n");
+  source.append("    assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
+  source.append("  } \n");
+
+  source.append("  __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
+  source.append("              __global unsigned int * start_bv_ind, \n");
+  source.append("              __global unsigned int * matrix_dimensions, \n");
+  source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
+  source.append("              __global unsigned int * start_bv_u_ind, \n");
+  source.append("              __global unsigned int * matrix_dimensions_u, \n");
+  source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
+  source.append("              __global unsigned int * start_bv_r_ind, \n");
+  source.append("              __global unsigned int * matrix_dimensions_r, \n");
+  source.append("              __global unsigned int * g_is_update, \n");
+  source.append("              //__local  "); source.append(numeric_string); source.append(" * local_gb, \n");
+  source.append("              unsigned int  block_elems_num) \n");
+  source.append("  { \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("      if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("        assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("  } \n");
+}
+
+template<typename StringT>
+void generate_spai_block_least_squares(StringT & source, std::string const & numeric_string)
+{
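+  // Generated kernel "block_least_squares": per block, solves the least-squares problem from the stored
+  // QR factors -- apply_q_trans_vec_ls() applies Q^T (Householder vectors below the diagonal of R, scalars
+  // in b_v) to the right-hand side y_v, then backwardSolve() back-substitutes R * m_v = y_v.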
+  source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+  source.append("  *res = 0.0; \n");
+  source.append("  for (unsigned int j = ind; j < row_n; ++j){ \n");
+  source.append("    if (j == ind){ \n");
+  source.append("      *res += v[ j]; \n");
+  source.append("    }else{ \n");
+  source.append("      *res += A[ j + ind*row_n]*v[ j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R,  unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
+  source.append("  for (int i = col_n-1; i >= 0; i--) { \n");
+  source.append("    x[ i] = y[ i]; \n");
+  source.append("    for (int j = i+1; j < col_n; ++j) { \n");
+  source.append("      x[ i] -= R[ i + j*row_n]*x[ j]; \n");
+  source.append("    } \n");
+  source.append("    x[i] /= R[ i + i*row_n]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+
+  source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v,  __global "); source.append(numeric_string); source.append(" * y){ \n");
+  source.append("            "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+  source.append("            for (unsigned int i = 0; i < col_n; ++i){ \n");
+  source.append("                custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
+  source.append("                for (unsigned int j = i; j < row_n; ++j){ \n");
+  source.append("                    if (i == j){ \n");
+  source.append("                        y[ j] -= b_v[ i]*inn_prod; \n");
+  source.append("                    } \n");
+  source.append("                    else{ \n");
+  source.append("                        y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
+  source.append("                    } \n");
+  source.append("                } \n");
+  source.append("            } \n");
+  source.append("        } \n");
+
+  source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
+  source.append("  apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
+  source.append("  //m_new - is m_v now \n");
+  source.append("  backwardSolve(R, row_n, col_n, y_v, m_v); \n");
+  source.append("} \n");
+
+  source.append("__kernel void block_least_squares( \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * global_R, \n");
+  source.append("      __global unsigned int * block_ind, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * b_v, \n");
+  source.append("      __global unsigned int * start_bv_inds, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * m_v, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * y_v, \n");
+  source.append("      __global unsigned int * start_y_inds, \n");
+  source.append("      __global unsigned int * matrix_dimensions, \n");
+  source.append("      __global unsigned int * g_is_update, \n");
+  source.append("      unsigned int  block_elems_num) \n");
+  source.append("{ \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("            ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_q_mult(StringT & source, std::string const & numeric_string)
+{
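+  // Generated kernel "block_q_mult": applies Q^T of an already factored block (Householder vectors in
+  // global_R, scalars in b_v) to every column of the update block R_u, which is staged in local memory.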
+  source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+  source.append("  *res = 0.0; \n");
+  source.append("  for (unsigned int j = ind; j < row_n; ++j){ \n");
+  source.append("    if (j == ind){ \n");
+  source.append("      *res += v[j]; \n");
+  source.append("    }else{ \n");
+  source.append("      *res += A[j + ind*row_n]*v[j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
+  source.append("  "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+  source.append("  for (unsigned int i = 0; i < col_n; ++i){ \n");
+  source.append("    custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
+  source.append("    for (unsigned int j = i; j < row_n; ++j){ \n");
+  source.append("      if (i == j){ \n");
+  source.append("        y[j] -= b_v[ i]*inn_prod; \n");
+  source.append("      } \n");
+  source.append("      else{ \n");
+  source.append("        y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
+  source.append("        for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
+  source.append("          apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
+  source.append("        } \n");
+  source.append("} \n");
+
+  source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+  source.append("  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n; ++j){ \n");
+  source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+  source.append("  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n; ++j){ \n");
+  source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
+  source.append("  __global unsigned int * block_ind, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
+  source.append("  __global unsigned int *block_ind_u, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * b_v, \n");
+  source.append("  __global unsigned int * start_bv_inds, \n");
+  source.append("  __global unsigned int * matrix_dimensions, \n");
+  source.append("  __global unsigned int * matrix_dimensions_u, \n");
+  source.append("  __global unsigned int * g_is_update, \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" * local_R_u, \n");
+  source.append("    unsigned int  block_elems_num){ \n");
+  source.append("    for (unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+  source.append("          if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
+          //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+  source.append("        matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("              q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
+  source.append("             matrix_dimensions_u[2*i + 1]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("              matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
+  source.append("          } \n");
+  source.append("      } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr(StringT & source, std::string const & numeric_string)
+{
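+  // Generated kernel "block_qr": in-place Householder QR of each block staged in local memory.
+  // After single_qr(), the upper triangle of the block holds R, the Householder vectors are stored
+  // below the diagonal, and their scalars end up in b_v.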
+  source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
+  source.append("    *res = 0; \n");
+  source.append("    for (unsigned int i = beg_ind; i < n; ++i){ \n");
+  source.append("        *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
+  source.append("    for (unsigned int i = beg_ind; i < n; ++i){ \n");
+  source.append("        v[i] /= b; \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
+  source.append("    for (unsigned int i = beg_ind; i < n; ++i){ \n");
+  source.append("        v[i] = A[(beg_ind-1)*n + i]; \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+
+  source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
+  source.append("    "); source.append(numeric_string); source.append(" sg; \n");
+  source.append("    dot_prod(A, n, j+1, &sg); \n");
+  source.append("    copy_vector(A, v, j+1, n); \n");
+  source.append("    "); source.append(numeric_string); source.append(" mu; \n");
+  source.append("    v[j] = 1.0; \n");
+      //print_contigious_vector(v, v_start_ind, n);
+  source.append("    if (sg == 0){ \n");
+  source.append("        *b = 0; \n");
+  source.append("    } \n");
+  source.append("    else{ \n");
+  source.append("        mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
+  source.append("        if (A[ j*n + j] <= 0){ \n");
+  source.append("            v[j] = A[ j*n + j] - mu; \n");
+  source.append("        }else{ \n");
+  source.append("            v[j] = -sg/(A[ j*n + j] + mu); \n");
+  source.append("        } \n");
+  source.append("    *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
+          //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
+  source.append("        vector_div(v, j, v[j], n); \n");
+          //print_contigious_vector(v, v_start_ind, n);
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
+  source.append("    for (unsigned int i = start_ind; i < row_num; ++i){ \n");
+  source.append("        *res += A[col_ind*row_num + i]*v[i]; \n");
+  source.append("    } \n");
+  source.append("} \n");
+  //
+  source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A,  unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
+  source.append("    "); source.append(numeric_string); source.append(" in_prod_res; \n");
+  source.append("    for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
+  source.append("        in_prod_res = 0.0; \n");
+  source.append("        custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
+  source.append("        for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
+  source.append("            A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A,  unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
+  source.append("    for (unsigned int i = ind; i < n; ++i){ \n");
+  source.append("        A[ (ind-1)*n + i] = v[i]; \n");
+  source.append("    } \n");
+  source.append("} \n");
+
+  source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
+  // matrix_dimensions[2*matrix_ind]     - number of rows of this block
+  // matrix_dimensions[2*matrix_ind + 1] - number of columns of this block
+  source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+  source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+
+  source.append("  if ((col_n == row_n) && (row_n == 1)){ \n");
+  source.append("    b_v[0] = 0.0; \n");
+  source.append("    return; \n");
+  source.append("  } \n");
+  source.append("  for (unsigned int i = 0; i < col_n; ++i){ \n");
+  source.append("    if (get_local_id(0) == 0){ \n");
+  source.append("      householder_vector(R, i, row_n, v, b_v + i); \n");
+  source.append("    } \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) == 0){ \n");
+  source.append("      if (i < matrix_dimensions[2*matrix_ind]){ \n");
+  source.append("        store_householder_vector(R, i+1, row_n, v); \n");
+  source.append("      } \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+  source.append("  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n; ++j){ \n");
+  source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+  source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+  source.append("  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n; ++j){ \n");
+  source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+
+  source.append("__kernel void block_qr( \n");
+  source.append("      __global "); source.append(numeric_string); source.append("* R, \n");
+  source.append("      __global unsigned int* matrix_dimensions, \n");
+  source.append("      __global "); source.append(numeric_string); source.append("* b_v, \n");
+  source.append("      __global "); source.append(numeric_string); source.append("* v, \n");
+  source.append("      __global unsigned int* start_matrix_inds, \n");
+  source.append("      __global unsigned int* start_bv_inds, \n");
+  source.append("      __global unsigned int* start_v_inds, \n");
+  source.append("      __global unsigned int * g_is_update, \n");
+  source.append("      __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
+  source.append("      unsigned int block_elems_num){ \n");
+  source.append("    for (unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("      matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("            single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("            matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr_assembly(StringT & source, std::string const & numeric_string)
+{
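+  // Generated kernel "block_qr_assembly": assembles each block R_q from the lower rows of R_u and,
+  // when diff = row_n_u - col_n > 0, from R_u_u (blocks flagged in g_is_update only).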
+  source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n");
+  source.append("            unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+  source.append("            unsigned int row_n_u, unsigned int col_n_u, \n");
+  source.append("            unsigned int col_n, unsigned int diff){ \n");
+  source.append("            for (unsigned int i = 0; i < col_n_q; ++i){ \n");
+  source.append("                for (unsigned int j = 0; j < diff; ++j){ \n");
+  source.append("          R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
+  source.append("                } \n");
+  source.append("            } \n");
+  source.append("        } \n");
+
+  source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+  source.append("             unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
+  source.append("             unsigned int diff){ \n");
+  source.append("  for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+  source.append("      R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+  source.append("            unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
+  source.append("            unsigned int diff = row_n_u - col_n; \n");
+  source.append("            assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+  source.append("            if (diff > 0){ \n");
+  source.append("              assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
+  source.append("            } \n");
+  source.append("} \n");
+
+  source.append("__kernel void block_qr_assembly( \n");
+  source.append("      __global unsigned int * matrix_dimensions, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * R_u, \n");
+  source.append("      __global unsigned int * block_ind_u, \n");
+  source.append("      __global unsigned int * matrix_dimensions_u, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+  source.append("      __global unsigned int * block_ind_u_u, \n");
+  source.append("      __global unsigned int * matrix_dimensions_u_u, \n");
+  source.append("      __global "); source.append(numeric_string); source.append(" * R_q, \n");
+  source.append("      __global unsigned int * block_ind_q, \n");
+  source.append("      __global unsigned int * matrix_dimensions_q, \n");
+  source.append("      __global unsigned int * g_is_update, \n");
+  source.append("          //__local  "); source.append(numeric_string); source.append(" * local_R_q, \n");
+  source.append("      unsigned int  block_elems_num) \n");
+  source.append("{ \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("           assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+  source.append("             matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+  source.append("       } \n");
+  source.append("   } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr_assembly_1(StringT & source, std::string const & numeric_string)
+{
+  source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+  source.append("             unsigned int row_n_u, unsigned int col_n_u, \n");
+  source.append("             unsigned int col_n, unsigned int diff){ \n");
+  source.append("            for (unsigned int i = 0; i < col_n_q; ++i){ \n");
+  source.append("                for (unsigned int j = 0; j < diff; ++j){ \n");
+  source.append("          R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
+  source.append("                } \n");
+  source.append("            } \n");
+  source.append("        } \n");
+
+
+  source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q,  unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+  source.append("            unsigned int col_n_u, unsigned int col_n){ \n");
+  source.append("            unsigned int diff = row_n_u - col_n; \n");
+  source.append("            assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+  source.append("} \n");
+
+  source.append("__kernel void block_qr_assembly_1( \n");
+  source.append("  __global unsigned int * matrix_dimensions, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * R_u, \n");
+  source.append("  __global unsigned int * block_ind_u, \n");
+  source.append("  __global unsigned int * matrix_dimensions_u, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * R_q, \n");
+  source.append("  __global unsigned int * block_ind_q, \n");
+  source.append("  __global unsigned int * matrix_dimensions_q, \n");
+  source.append("  __global unsigned int * g_is_update, \n");
+  source.append("  unsigned int  block_elems_num) \n");
+  source.append("{ \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+  source.append("            assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+  source.append("              matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_r_assembly(StringT & source, std::string const & numeric_string)
+{
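+  // Generated kernel "block_r_assembly": copies the sub-blocks R, R_u and R_u_u into their respective
+  // positions inside the enlarged per-block matrix g_R (blocks flagged in g_is_update only).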
+  source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n");
+  source.append("        unsigned int row_n, unsigned int col_n) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = 0; i < col_n; ++i){ \n");
+  source.append("     for (unsigned int j = 0; j < row_n; ++j){ \n");
+  source.append("    gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
+  source.append("     } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n");
+  source.append("          unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
+  source.append("          unsigned int col_n) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = 0; i < col_n_u; ++i){ \n");
+  source.append("    for (unsigned int j = 0; j < col_n; ++j){ \n");
+  source.append("      gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+
+  source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR,  unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n");
+  source.append("          unsigned int col_n_u_u, unsigned int col_n) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+  source.append("    for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+  source.append("      gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+  source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n");
+  source.append("        unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+  source.append("        unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
+  source.append("        assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
+  source.append("        assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
+  source.append("        assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
+  source.append("} \n");
+
+
+  source.append("__kernel void block_r_assembly( \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * R, \n");
+  source.append("  __global unsigned int * block_ind, \n");
+  source.append("  __global unsigned int * matrix_dimensions, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * R_u, \n");
+  source.append("  __global unsigned int * block_ind_u, \n");
+  source.append("  __global unsigned int * matrix_dimensions_u, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+  source.append("  __global unsigned int * block_ind_u_u, \n");
+  source.append("  __global unsigned int * matrix_dimensions_u_u, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * g_R, \n");
+  source.append("  __global unsigned int * block_ind_r, \n");
+  source.append("  __global unsigned int * matrix_dimensions_r, \n");
+  source.append("  __global unsigned int * g_is_update, \n");
+  source.append("  unsigned int  block_elems_num) \n");
+  source.append("{ \n");
+  source.append("    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+  source.append("        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+
+  source.append("            assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
+  source.append("              matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
+  source.append("              R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
+
+  source.append("        } \n");
+  source.append("    } \n");
+  source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners. */
+template<typename NumericT>
+struct spai
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      generate_spai_assemble_blocks(source, numeric_string);
+      generate_spai_block_bv_assembly(source, numeric_string);
+      generate_spai_block_least_squares(source, numeric_string);
+      generate_spai_block_q_mult(source, numeric_string);
+      generate_spai_block_qr(source, numeric_string);
+      generate_spai_block_qr_assembly(source, numeric_string);
+      generate_spai_block_qr_assembly_1(source, numeric_string);
+      generate_spai_block_r_assembly(source, numeric_string);
+
+      std::string prog_name = program_name();
+      #ifdef VIENNACL_BUILD_INFO
+      std::cout << "Creating program " << prog_name << std::endl;
+      #endif
+      ctx.add_program(source, prog_name);
+      init_done[ctx.handle().get()] = true;
+    } //if
+  } //init
+};
+
+}  // namespace kernels
+}  // namespace opencl
+}  // namespace linalg
+}  // namespace viennacl
+#endif
+
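The spai<NumericT> class above follows ViennaCL's usual lazily-compiled kernel-program pattern: init() builds the program once per OpenCL context, after which individual kernels are fetched by name. A minimal usage sketch, assuming a build with VIENNACL_WITH_OPENCL, a device with double-precision support, and device buffers prepared elsewhere (kernel arguments are omitted here):

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/spai.hpp"

void prepare_spai_kernels()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();

  // Compiles all SPAI kernels for double precision once per context; later calls are no-ops.
  viennacl::linalg::opencl::kernels::spai<double>::init(ctx);

  // Fetch one of the kernels generated above by the name used in the assembled source string:
  viennacl::ocl::kernel & k =
      ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<double>::program_name(),
                     "assemble_blocks");

  k.global_work_size(0, 256);
  k.local_work_size(0, 64);
  // viennacl::ocl::enqueue(k(/* device buffers and sizes */)); would launch it once arguments are bound.
}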


[36/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
new file mode 100644
index 0000000..4330431
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
@@ -0,0 +1,2049 @@
+#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/iterative_operations.hpp
+    @brief CUDA implementations of operations on sparse matrices for the pipelined iterative solvers
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// CG vector update:
+//
+
+// cpu scalar
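+// Fuses the three CG vector updates x += alpha*p, r -= alpha*A*p, p = r + beta*p and accumulates a
+// block-wise partial sum of <r,r> into inner_prod_buffer (one entry per thread block).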
+template<typename NumericT>
+__global__ void pipelined_cg_vector_kernel(NumericT * result,
+                                           NumericT alpha,
+                                           NumericT * p,
+                                           NumericT * r,
+                                           NumericT const * Ap,
+                                           NumericT beta,
+                                           NumericT * inner_prod_buffer,
+                                           unsigned int size)
+{
+  NumericT inner_prod_contrib = 0;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    NumericT value_p = p[i];
+    NumericT value_r = r[i];
+
+    result[i] += alpha * value_p;
+    value_r   -= alpha * Ap[i];
+    value_p    = value_r + beta * value_p;
+
+    p[i] = value_p;
+    r[i] = value_r;
+    inner_prod_contrib += value_r * value_r;
+  }
+
+  // parallel reduction in work group
+  __shared__ NumericT shared_array[256];
+  shared_array[threadIdx.x] = inner_prod_contrib;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0)
+    inner_prod_buffer[blockIdx.x] = shared_array[0];
+}
+
+
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+                                NumericT alpha,
+                                vector_base<NumericT> & p,
+                                vector_base<NumericT> & r,
+                                vector_base<NumericT> const & Ap,
+                                NumericT beta,
+                                vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = result.size();
+  pipelined_cg_vector_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
+                                           alpha,
+                                           viennacl::cuda_arg(p),
+                                           viennacl::cuda_arg(r),
+                                           viennacl::cuda_arg(Ap),
+                                           beta,
+                                           viennacl::cuda_arg(inner_prod_buffer),
+                                           size);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
+}
+
+
+
+
+//
+// Compressed matrix
+//
+
+
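+// Blocked CSR kernel: SubWarpSizeV threads cooperate on each row, combining their partial products with a
+// butterfly (index-XOR) reduction in shared memory; per-block partial sums of <Ap,Ap> and <p,Ap> go to
+// inner_prod_buffer at offsets buffer_size and 2*buffer_size, respectively.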
+template<unsigned int SubWarpSizeV, typename NumericT>
+__global__ void pipelined_cg_csr_vec_mul_blocked_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * p,
+          NumericT * Ap,
+          unsigned int size,
+          NumericT * inner_prod_buffer,
+          unsigned int buffer_size)
+{
+  __shared__ NumericT shared_elements[256];
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp = 0;
+
+  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
+  const unsigned int block_start = blockIdx.x * block_increment;
+  const unsigned int block_stop  = min(block_start + block_increment, size);
+
+  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
+                    row  < block_stop;
+                    row += blockDim.x / SubWarpSizeV)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+      dot_prod += elements[i] * p[column_indices[i]];
+
+    shared_elements[threadIdx.x] = dot_prod;
+    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
+    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
+    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
+    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
+    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+    if (id_in_row == 0)
+    {
+      Ap[row] = shared_elements[threadIdx.x];
+      inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
+      inner_prod_pAp  +=                       p[row] * shared_elements[threadIdx.x];
+    }
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+
+}
+
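+// Adaptive CSR kernel: row_blocks partitions the rows into chunks whose nonzeros fit the shared staging
+// buffer; a chunk spanning several rows is processed CSR-stream style (stage the products, then one thread
+// per row sums), while a chunk consisting of a single long row falls back to a block-wide CSR-vector reduction.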
+template<typename NumericT>
+__global__ void pipelined_cg_csr_vec_mul_adaptive_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const unsigned int * row_blocks,
+          const NumericT * elements,
+          unsigned int num_blocks,
+          const NumericT * p,
+          NumericT * Ap,
+          unsigned int size,
+          NumericT * inner_prod_buffer,
+          unsigned int buffer_size)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp = 0;
+
+  __shared__ NumericT     shared_elements[1024];
+
+  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+  {
+    unsigned int row_start = row_blocks[block_id];
+    unsigned int row_stop  = row_blocks[block_id + 1];
+    unsigned int element_start = row_indices[row_start];
+    unsigned int element_stop = row_indices[row_stop];
+    unsigned int rows_to_process = row_stop - row_start;
+
+    if (rows_to_process > 1)  // CSR stream with one thread per row
+    {
+      // load to shared buffer:
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
+
+      __syncthreads();
+
+      // use one thread per row to sum:
+      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+      {
+        NumericT dot_prod = 0;
+        unsigned int thread_row_start = row_indices[row]     - element_start;
+        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
+        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+          dot_prod += shared_elements[i];
+        Ap[row] = dot_prod;
+        inner_prod_ApAp += dot_prod * dot_prod;
+        inner_prod_pAp  +=   p[row] * dot_prod;
+      }
+    }
+    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
+    else // CSR vector for a single row
+    {
+      // load and sum to shared buffer:
+      shared_elements[threadIdx.x] = 0;
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
+
+      // reduction to obtain final result
+      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+      {
+        __syncthreads();
+        if (threadIdx.x < stride)
+          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+      }
+
+      if (threadIdx.x == 0)
+      {
+        Ap[row_start] = shared_elements[0];
+        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
+        inner_prod_pAp  +=       p[row_start] * shared_elements[0];
+      }
+    }
+
+    __syncthreads();  // avoid race conditions
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+}
+
+
+
+
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
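+  // inner_prod_buffer holds three segments of length buffer_size_per_vector: the per-block partial sums
+  // of <r,r>, <Ap,Ap> and <p,Ap>, respectively.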
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
+  {
+    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+  {
+    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                                        viennacl::cuda_arg<NumericT>(A.handle()),
+                                                                        viennacl::cuda_arg(p),
+                                                                        viennacl::cuda_arg(Ap),
+                                                                        size,
+                                                                        viennacl::cuda_arg(inner_prod_buffer),
+                                                                        buffer_size_per_vector
+                                                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
+  }
+  else
+  {
+    pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                           viennacl::cuda_arg<NumericT>(A.handle()),
+                                                           static_cast<unsigned int>(A.blocks1()),
+                                                           viennacl::cuda_arg(p),
+                                                           viennacl::cuda_arg(Ap),
+                                                           size,
+                                                           viennacl::cuda_arg(inner_prod_buffer),
+                                                           buffer_size_per_vector);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
+  }
+}
+
+
+//
+// Coordinate Matrix
+//
+
+
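+// COO kernel: each block walks the (row, column) pairs of its group (delimited by group_boundaries) and
+// combines products of equal row index via a segmented reduction in shared memory, carrying the last
+// partial row sum over to the next iteration; <Ap,Ap> and <p,Ap> are again reduced block-wise.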
+template<typename NumericT>
+__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                const NumericT * elements,
+                                                const unsigned int * group_boundaries,
+                                                const NumericT * p,
+                                                NumericT * Ap,
+                                                unsigned int size,
+                                                NumericT * inner_prod_buffer,
+                                                unsigned int buffer_size)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int k = 0; k < k_end; ++k)
+  {
+    local_index = group_start + k * blockDim.x + threadIdx.x;
+
+    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
+
+    //check for carry from previous loop run:
+    if (threadIdx.x == 0 && k > 0)
+    {
+      if (tmp.x == shared_rows[blockDim.x-1])
+        val += inter_results[blockDim.x-1];
+      else
+      {
+        NumericT Ap_entry = inter_results[blockDim.x-1];
+        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
+        inner_prod_ApAp += Ap_entry * Ap_entry;
+        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
+      }
+    }
+
+    //segmented parallel reduction begin
+    __syncthreads();
+    shared_rows[threadIdx.x] = tmp.x;
+    inter_results[threadIdx.x] = val;
+    NumericT left = 0;
+    __syncthreads();
+
+    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+    {
+      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+      __syncthreads();
+      inter_results[threadIdx.x] += left;
+      __syncthreads();
+    }
+    //segmented parallel reduction end
+
+    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+    {
+      NumericT Ap_entry = inter_results[threadIdx.x];
+      Ap[tmp.x] = Ap_entry;
+      inner_prod_ApAp += Ap_entry * Ap_entry;
+      inner_prod_pAp  += Ap_entry * p[tmp.x];
+    }
+
+    __syncthreads();
+  } //for k
+
+  if (local_index + 1 == group_end)
+  {
+    NumericT Ap_entry = inter_results[threadIdx.x];
+    Ap[tmp.x] = Ap_entry;
+    inner_prod_ApAp += Ap_entry * Ap_entry;
+    inner_prod_pAp  += Ap_entry * p[tmp.x];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+
+}
+
+
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  Ap.clear();
+
+  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+                                                viennacl::cuda_arg<NumericT>(A.handle()),
+                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                viennacl::cuda_arg(p),
+                                                viennacl::cuda_arg(Ap),
+                                                size,
+                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
+}
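+
+// Usage sketch (a minimal illustration only; the helper name and the chunk size of 128
+// are assumptions, not part of this file): 'inner_prod_buffer' consists of three equally
+// sized chunks, one entry per CUDA block of the reduction kernels. The kernel above fills
+// the second chunk with per-block partial sums of <Ap, Ap> and the third with partial sums
+// of <p, Ap>; the first chunk is reserved for the remaining inner product of the pipelined
+// CG update, filled by other kernels.
+//
+//   template<typename NumericT>
+//   void example_pipelined_cg_prod_usage(viennacl::coordinate_matrix<NumericT> const & A,
+//                                        viennacl::vector<NumericT> const & p)
+//   {
+//     vcl_size_t chunk = 128;                                   // one entry per CUDA block
+//     viennacl::vector<NumericT> Ap(p.size());
+//     viennacl::vector<NumericT> inner_prod_buffer(3 * chunk);  // three reduction chunks
+//     pipelined_cg_prod(A, p, Ap, inner_prod_buffer);           // resolved in the enclosing namespace
+//   }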
+
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
+                                                const NumericT * elements,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                const NumericT * p,
+                                                NumericT * Ap,
+                                                unsigned int size,
+                                                NumericT * inner_prod_buffer,
+                                                unsigned int buffer_size)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int row = glb_id; row < size; row += glb_sz)
+  {
+    NumericT sum = 0;
+
+    unsigned int offset = row;
+    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+    {
+      NumericT val = elements[offset];
+      sum += val ? p[coords[offset]] * val : NumericT(0);
+    }
+
+    Ap[row] = sum;
+    inner_prod_ApAp += sum * sum;
+    inner_prod_pAp  += sum * p[row];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+}
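+
+// Note on the ELL layout used above: the j-th stored entry of row 'r' sits at
+// elements[r + j * internal_row_num], with its column index at the same position in
+// 'coords'; padded entries are zero and are skipped via the 'val ?' check. For example,
+// with internal_row_num = 4 and items_per_row = 2, row 1 reads elements[1] and elements[5].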
+
+
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = static_cast<unsigned int>(p.size());
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                viennacl::cuda_arg<NumericT>(A.handle()),
+                                                static_cast<unsigned int>(A.internal_size1()),
+                                                static_cast<unsigned int>(A.maxnnz()),
+                                                viennacl::cuda_arg(p),
+                                                viennacl::cuda_arg(Ap),
+                                                size,
+                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
+}
+
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
+                                                       const unsigned int * column_indices,
+                                                       const unsigned int * block_start,
+                                                       const NumericT * elements,
+                                                       const NumericT * p,
+                                                       NumericT * Ap,
+                                                       unsigned int size,
+                                                       unsigned int block_size,
+                                                       NumericT * inner_prod_buffer,
+                                                       unsigned int buffer_size)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+
+  unsigned int blocks_per_threadblock = blockDim.x / block_size;
+  unsigned int id_in_block = threadIdx.x % block_size;
+  unsigned int num_blocks = (size - 1) / block_size + 1;
+  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
+  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
+
+  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
+  {
+    unsigned int row         = block_idx * block_size + id_in_block;
+    unsigned int offset      = block_start[block_idx];
+    unsigned int num_columns = columns_per_block[block_idx];
+
+    NumericT sum = 0;
+    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
+    {
+      unsigned int index = offset + item_id * block_size + id_in_block;
+      NumericT val = elements[index];
+
+      sum += val ? (p[column_indices[index]] * val) : 0;
+    }
+
+    if (row < size)
+    {
+      Ap[row] = sum;
+      inner_prod_ApAp += sum * sum;
+      inner_prod_pAp  += sum * p[row];
+    }
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+}
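+
+// Note on the sliced ELL (SELL-C-\sigma) kernel above: each group of 'block_size'
+// consecutive rows (block_size == A.rows_per_block() on the host side) is handled by one
+// sub-group of block_size threads. block_start[block_idx] gives the offset of that slice
+// in 'elements'/'column_indices', columns_per_block[block_idx] its padded width; padded
+// zeros are skipped via the 'val ?' check, and rows beyond 'size' in the last slice are
+// masked out by the 'row < size' guard.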
+
+template<typename NumericT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                       viennacl::cuda_arg<NumericT>(A.handle()),
+                                                       viennacl::cuda_arg(p),
+                                                       viennacl::cuda_arg(Ap),
+                                                       size,
+                                                       static_cast<unsigned int>(A.rows_per_block()),
+                                                       viennacl::cuda_arg(inner_prod_buffer),
+                                                       buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
+}
+
+
+//
+// Hybrid Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
+                                                const NumericT * ell_elements,
+                                                const unsigned int * csr_rows,
+                                                const unsigned int * csr_cols,
+                                                const NumericT * csr_elements,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                const NumericT * p,
+                                                NumericT * Ap,
+                                                unsigned int size,
+                                                NumericT * inner_prod_buffer,
+                                                unsigned int buffer_size)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int row = glb_id; row < size; row += glb_sz)
+  {
+    NumericT sum = 0;
+
+    unsigned int offset = row;
+    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+    {
+      NumericT val = ell_elements[offset];
+
+      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
+    }
+
+    unsigned int col_begin = csr_rows[row];
+    unsigned int col_end   = csr_rows[row + 1];
+
+    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
+    {
+      sum += p[csr_cols[item_id]] * csr_elements[item_id];
+    }
+
+    Ap[row] = sum;
+    inner_prod_ApAp += sum * sum;
+    inner_prod_pAp  += sum * p[row];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+  }
+}
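+
+// Note on the hybrid (HYB) kernel above: the regular part of each row is stored in ELL
+// form (at most 'items_per_row' == A.ell_nnz() entries, zero-padded), while overflow
+// entries live in the CSR arrays csr_rows/csr_cols/csr_elements; both contributions are
+// accumulated into the same row sum before Ap[row] and the inner-product terms are updated.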
+
+
+
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                viennacl::cuda_arg<NumericT>(A.handle()),
+                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                viennacl::cuda_arg<unsigned int>(A.handle4()),
+                                                viennacl::cuda_arg<NumericT>(A.handle5()),
+                                                static_cast<unsigned int>(A.internal_size1()),
+                                                static_cast<unsigned int>(A.ell_nnz()),
+                                                viennacl::cuda_arg(p),
+                                                viennacl::cuda_arg(Ap),
+                                                size,
+                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
+}
+
+
+
+/////////////////////////////////////
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
+                                                   NumericT const * residual,
+                                                   NumericT const * Ap,
+                                                   unsigned int size,
+                                                   NumericT * inner_prod_buffer,
+                                                   unsigned int chunk_size,
+                                                   unsigned int chunk_offset)
+{
+  NumericT alpha = 0;
+
+  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
+  __shared__ NumericT shared_array[256];
+  __shared__ NumericT shared_array_Ap_in_r0[256];
+
+  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
+  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      shared_array[threadIdx.x]          += shared_array[threadIdx.x + stride];
+      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
+    }
+  }
+
+  // compute alpha from reduced values:
+  __syncthreads();
+  alpha = shared_array[0] / shared_array_Ap_in_r0[0];
+
+  // run vector update and compute first stage of <s, s>
+  NumericT inner_prod_contrib = 0;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    NumericT value_s = s[i];
+
+    value_s = residual[i] - alpha * Ap[i];
+    inner_prod_contrib += value_s * value_s;
+
+    s[i] = value_s;
+  }
+  __syncthreads();
+
+  // parallel reduction in work group
+  shared_array[threadIdx.x] = inner_prod_contrib;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // write results to inner_prod_buffer
+  if (threadIdx.x == 0)
+    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+                                 vector_base<NumericT> & r,
+                                 vector_base<NumericT> const & Ap,
+                                 vector_base<NumericT> & inner_prod_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  unsigned int size = static_cast<unsigned int>(s.size());
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+  pipelined_bicgstab_update_s_kernel<<<256, 256>>>(viennacl::cuda_arg(s),
+                                                   viennacl::cuda_arg(r),
+                                                   viennacl::cuda_arg(Ap),
+                                                   size,
+                                                   viennacl::cuda_arg(inner_prod_buffer),
+                                                   chunk_size,
+                                                   chunk_offset);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
+}
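+
+// Note: pipelined_bicgstab_update_s() fuses three steps of the BiCGStab iteration. The
+// kernel finishes the reduction of the per-block partial sums in inner_prod_buffer (first
+// chunk: <r, r0*>, chunk starting at 3 * buffer_chunk_size: <Ap, r0*>), forms
+// alpha = <r, r0*> / <Ap, r0*>, computes s_i = r_i - alpha * Ap_i, and writes per-block
+// partial sums of <s, s> to the chunk starting at buffer_chunk_offset for a subsequent
+// reduction stage.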
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
+                                                 NumericT alpha,
+                                                 NumericT * p,
+                                                 NumericT omega,
+                                                 NumericT const * s,
+                                                 NumericT * residual,
+                                                 NumericT const * As,
+                                                 NumericT beta,
+                                                 NumericT const * Ap,
+                                                 NumericT const * r0star,
+                                                 NumericT * inner_prod_buffer,
+                                                 unsigned int size)
+{
+  NumericT inner_prod_r_r0star = 0;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    NumericT value_result = result[i];
+    NumericT value_p = p[i];
+    NumericT value_s = s[i];
+    NumericT value_residual = residual[i];
+    NumericT value_As = As[i];
+    NumericT value_Ap = Ap[i];
+    NumericT value_r0star = r0star[i];
+
+    value_result   += alpha * value_p + omega * value_s;
+    value_residual  = value_s - omega * value_As;
+    value_p         = value_residual + beta * (value_p - omega * value_Ap);
+
+    result[i]   = value_result;
+    residual[i] = value_residual;
+    p[i]        = value_p;
+    inner_prod_r_r0star += value_residual * value_r0star;
+  }
+
+  // parallel reduction in work group
+  __shared__ NumericT shared_array[256];
+  shared_array[threadIdx.x] = inner_prod_r_r0star;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0)
+    inner_prod_buffer[blockIdx.x] = shared_array[0];
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+                                      NumericT beta, vector_base<NumericT> const & Ap,
+                                      vector_base<NumericT> const & r0star,
+                                      vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
+{
+  (void)buffer_chunk_size;
+  unsigned int size = static_cast<unsigned int>(result.size());
+
+  pipelined_bicgstab_vector_kernel<<<256, 256>>>(viennacl::cuda_arg(result),
+                                                 alpha,
+                                                 viennacl::cuda_arg(p),
+                                                 omega,
+                                                 viennacl::cuda_arg(s),
+                                                 viennacl::cuda_arg(residual),
+                                                 viennacl::cuda_arg(As),
+                                                 beta,
+                                                 viennacl::cuda_arg(Ap),
+                                                 viennacl::cuda_arg(r0star),
+                                                 viennacl::cuda_arg(inner_prod_buffer),
+                                                 size);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
+}
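+
+// Note: pipelined_bicgstab_vector_update() applies the fused BiCGStab vector updates
+//   x <- x + alpha * p + omega * s,
+//   r <- s - omega * As,
+//   p <- r + beta * (p - omega * Ap),
+// and starts the next reduction by writing per-block partial sums of <r, r0*> to
+// inner_prod_buffer[0 .. gridDim.x).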
+
+
+
+//
+// Compressed matrix
+//
+
+
+template<unsigned int SubWarpSizeV, typename NumericT>
+__global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * p,
+          NumericT * Ap,
+          const NumericT * r0star,
+          unsigned int size,
+          NumericT * inner_prod_buffer,
+          unsigned int buffer_size,
+          unsigned int buffer_offset)
+{
+  __shared__ NumericT shared_elements[256];
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp = 0;
+  NumericT inner_prod_r0Ap  = 0;
+
+  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
+  const unsigned int block_start = blockIdx.x * block_increment;
+  const unsigned int block_stop  = min(block_start + block_increment, size);
+
+  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
+                    row  < block_stop;
+                    row += blockDim.x / SubWarpSizeV)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+      dot_prod += elements[i] * p[column_indices[i]];
+
+    shared_elements[threadIdx.x] = dot_prod;
+    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
+    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
+    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
+    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
+    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+    if (id_in_row == 0)
+    {
+      Ap[row] = shared_elements[threadIdx.x];
+      inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
+      inner_prod_pAp  +=                       p[row] * shared_elements[threadIdx.x];
+      inner_prod_r0Ap +=                  r0star[row] * shared_elements[threadIdx.x];
+    }
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+
+}
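+
+// Note on the blocked CSR kernel above: SubWarpSizeV consecutive threads share one row and
+// reduce their partial dot products with an XOR-based butterfly over shared memory (strides
+// 1, 2, ..., SubWarpSizeV/2), after which every lane of the sub-warp holds the row sum and
+// lane 0 (id_in_row == 0) writes it. This relies on implicit warp-synchronous execution
+// within a sub-warp, i.e. SubWarpSizeV not exceeding the warp size.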
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const unsigned int * row_blocks,
+          const NumericT * elements,
+          unsigned int num_blocks,
+          const NumericT * p,
+          NumericT * Ap,
+          const NumericT * r0star,
+          unsigned int size,
+          NumericT * inner_prod_buffer,
+          unsigned int buffer_size,
+          unsigned int buffer_offset)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp = 0;
+  NumericT inner_prod_r0Ap  = 0;
+
+  __shared__ NumericT     shared_elements[1024];
+
+  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+  {
+    unsigned int row_start = row_blocks[block_id];
+    unsigned int row_stop  = row_blocks[block_id + 1];
+    unsigned int element_start = row_indices[row_start];
+    unsigned int element_stop = row_indices[row_stop];
+    unsigned int rows_to_process = row_stop - row_start;
+
+    if (rows_to_process > 1)  // CSR stream with one thread per row
+    {
+      // load to shared buffer:
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
+
+      __syncthreads();
+
+      // use one thread per row to sum:
+      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+      {
+        NumericT dot_prod = 0;
+        unsigned int thread_row_start = row_indices[row]     - element_start;
+        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
+        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+          dot_prod += shared_elements[i];
+        Ap[row] = dot_prod;
+        inner_prod_ApAp += dot_prod * dot_prod;
+        inner_prod_pAp  +=   p[row] * dot_prod;
+        inner_prod_r0Ap += r0star[row] * dot_prod;
+      }
+    }
+    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation; experience on Fermi suggests that this may not be necessary).
+    else // CSR vector for a single row
+    {
+      // load and sum to shared buffer:
+      shared_elements[threadIdx.x] = 0;
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
+
+      // reduction to obtain final result
+      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+      {
+        __syncthreads();
+        if (threadIdx.x < stride)
+          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+      }
+
+      if (threadIdx.x == 0)
+      {
+        Ap[row_start] = shared_elements[0];
+        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
+        inner_prod_pAp  +=       p[row_start] * shared_elements[0];
+        inner_prod_r0Ap +=  r0star[row_start] * shared_elements[0];
+      }
+    }
+
+    __syncthreads();  // avoid race conditions
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+}
+
+
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vec_size     = static_cast<unsigned int>(viennacl::traits::size(p));
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 20% of threads expected to idle
+  {
+    pipelined_bicgstab_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+  {
+    pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                                        viennacl::cuda_arg<NumericT>(A.handle()),
+                                                                        viennacl::cuda_arg(p),
+                                                                        viennacl::cuda_arg(Ap),
+                                                                        viennacl::cuda_arg(r0star),
+                                                                        vec_size,
+                                                                        viennacl::cuda_arg(inner_prod_buffer),
+                                                                        chunk_size,
+                                                                        chunk_offset
+                                                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
+  }
+  else
+  {
+    pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                                viennacl::cuda_arg<NumericT>(A.handle()),
+                                                                static_cast<unsigned int>(A.blocks1()),
+                                                                viennacl::cuda_arg(p),
+                                                                viennacl::cuda_arg(Ap),
+                                                                viennacl::cuda_arg(r0star),
+                                                                vec_size,
+                                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                                chunk_size,
+                                                                chunk_offset);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_adaptive_kernel");
+  }
+}
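+
+// Note on the dispatch above: if the average number of nonzeros per row, A.nnz() / A.size1(),
+// exceeds the threshold, the blocked kernel (one sub-warp per row) is used; otherwise the
+// adaptive kernel driven by the A.blocks1() precomputed row blocks is launched. Since
+// __CUDA_ARCH__ is only defined during device-side compilation, the second preprocessor
+// branch (16 threads per row, threshold 12.0) is the one compiled into this host function.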
+
+
+//
+// Coordinate Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                const NumericT * elements,
+                                                const unsigned int * group_boundaries,
+                                                const NumericT * p,
+                                                NumericT * Ap,
+                                                const NumericT * r0star,
+                                                unsigned int size,
+                                                NumericT * inner_prod_buffer,
+                                                unsigned int buffer_size,
+                                                unsigned int buffer_offset)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  NumericT inner_prod_r0Ap  = 0;
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int k = 0; k < k_end; ++k)
+  {
+    local_index = group_start + k * blockDim.x + threadIdx.x;
+
+    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
+
+    //check for carry from previous loop run:
+    if (threadIdx.x == 0 && k > 0)
+    {
+      if (tmp.x == shared_rows[blockDim.x-1])
+        val += inter_results[blockDim.x-1];
+      else
+      {
+        NumericT Ap_entry = inter_results[blockDim.x-1];
+        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
+        inner_prod_ApAp += Ap_entry * Ap_entry;
+        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
+        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
+      }
+    }
+
+    //segmented parallel reduction begin
+    __syncthreads();
+    shared_rows[threadIdx.x] = tmp.x;
+    inter_results[threadIdx.x] = val;
+    NumericT left = 0;
+    __syncthreads();
+
+    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+    {
+      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+      __syncthreads();
+      inter_results[threadIdx.x] += left;
+      __syncthreads();
+    }
+    //segmented parallel reduction end
+
+    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+    {
+      NumericT Ap_entry = inter_results[threadIdx.x];
+      Ap[tmp.x] = Ap_entry;
+      inner_prod_ApAp += Ap_entry * Ap_entry;
+      inner_prod_pAp  += Ap_entry * p[tmp.x];
+      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
+    }
+
+    __syncthreads();
+  } //for k
+
+  if (local_index + 1 == group_end)
+  {
+    NumericT Ap_entry = inter_results[threadIdx.x];
+    Ap[tmp.x] = Ap_entry;
+    inner_prod_ApAp += Ap_entry * Ap_entry;
+    inner_prod_pAp  += Ap_entry * p[tmp.x];
+    inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vec_size     = static_cast<unsigned int>(viennacl::traits::size(p));
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+  Ap.clear();
+
+  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+                                                      viennacl::cuda_arg<NumericT>(A.handle()),
+                                                      viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                      viennacl::cuda_arg(p),
+                                                      viennacl::cuda_arg(Ap),
+                                                      viennacl::cuda_arg(r0star),
+                                                      vec_size,
+                                                      viennacl::cuda_arg(inner_prod_buffer),
+                                                      chunk_size,
+                                                      chunk_offset);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
+}
+
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
+                                                const NumericT * elements,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                const NumericT * p,
+                                                NumericT * Ap,
+                                                const NumericT * r0star,
+                                                unsigned int size,
+                                                NumericT * inner_prod_buffer,
+                                                unsigned int buffer_size,
+                                                unsigned int buffer_offset)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  NumericT inner_prod_r0Ap  = 0;
+  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int row = glb_id; row < size; row += glb_sz)
+  {
+    NumericT sum = 0;
+
+    unsigned int offset = row;
+    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+    {
+      NumericT val = elements[offset];
+      sum += val ? p[coords[offset]] * val : NumericT(0);
+    }
+
+    Ap[row] = sum;
+    inner_prod_ApAp += sum * sum;
+    inner_prod_pAp  += sum * p[row];
+    inner_prod_r0Ap += sum * r0star[row];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vec_size     = static_cast<unsigned int>(viennacl::traits::size(p));
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                      viennacl::cuda_arg<NumericT>(A.handle()),
+                                                      static_cast<unsigned int>(A.internal_size1()),
+                                                      static_cast<unsigned int>(A.maxnnz()),
+                                                      viennacl::cuda_arg(p),
+                                                      viennacl::cuda_arg(Ap),
+                                                      viennacl::cuda_arg(r0star),
+                                                      vec_size,
+                                                      viennacl::cuda_arg(inner_prod_buffer),
+                                                      chunk_size,
+                                                      chunk_offset);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
+}
+
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
+                                                             const unsigned int * column_indices,
+                                                             const unsigned int * block_start,
+                                                             const NumericT * elements,
+                                                             const NumericT * p,
+                                                             NumericT * Ap,
+                                                             const NumericT * r0star,
+                                                             unsigned int size,
+                                                             unsigned int block_size,
+                                                             NumericT * inner_prod_buffer,
+                                                             unsigned int buffer_size,
+                                                             unsigned int buffer_offset)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  NumericT inner_prod_r0Ap  = 0;
+
+  unsigned int blocks_per_threadblock = blockDim.x / block_size;
+  unsigned int id_in_block = threadIdx.x % block_size;
+  unsigned int num_blocks = (size - 1) / block_size + 1;
+  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
+  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
+
+  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
+  {
+    unsigned int row         = block_idx * block_size + id_in_block;
+    unsigned int offset      = block_start[block_idx];
+    unsigned int num_columns = columns_per_block[block_idx];
+
+    NumericT sum = 0;
+    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
+    {
+      unsigned int index = offset + item_id * block_size + id_in_block;
+      NumericT val = elements[index];
+
+      sum += val ? (p[column_indices[index]] * val) : 0;
+    }
+
+    if (row < size)
+    {
+      Ap[row] = sum;
+      inner_prod_ApAp += sum * sum;
+      inner_prod_pAp  += sum * p[row];
+      inner_prod_r0Ap += sum * r0star[row];
+    }
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vec_size     = static_cast<unsigned int>(viennacl::traits::size(p));
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                             viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                             viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                             viennacl::cuda_arg<NumericT>(A.handle()),
+                                                             viennacl::cuda_arg(p),
+                                                             viennacl::cuda_arg(Ap),
+                                                             viennacl::cuda_arg(r0star),
+                                                             vec_size,
+                                                             static_cast<unsigned int>(A.rows_per_block()),
+                                                             viennacl::cuda_arg(inner_prod_buffer),
+                                                             chunk_size,
+                                                             chunk_offset);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
+}
+
+
+//
+// Hybrid Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
+                                                      const NumericT * ell_elements,
+                                                      const unsigned int * csr_rows,
+                                                      const unsigned int * csr_cols,
+                                                      const NumericT * csr_elements,
+                                                      unsigned int internal_row_num,
+                                                      unsigned int items_per_row,
+                                                      const NumericT * p,
+                                                      NumericT * Ap,
+                                                      const NumericT * r0star,
+                                                      unsigned int size,
+                                                      NumericT * inner_prod_buffer,
+                                                      unsigned int buffer_size,
+                                                      unsigned int buffer_offset)
+{
+  NumericT inner_prod_ApAp = 0;
+  NumericT inner_prod_pAp  = 0;
+  NumericT inner_prod_r0Ap  = 0;
+  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int row = glb_id; row < size; row += glb_sz)
+  {
+    NumericT sum = 0;
+
+    unsigned int offset = row;
+    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+    {
+      NumericT val = ell_elements[offset];
+
+      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
+    }
+
+    unsigned int col_begin = csr_rows[row];
+    unsigned int col_end   = csr_rows[row + 1];
+
+    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
+    {
+      sum += p[csr_cols[item_id]] * csr_elements[item_id];
+    }
+
+    Ap[row] = sum;
+    inner_prod_ApAp += sum * sum;
+    inner_prod_pAp  += sum * p[row];
+    inner_prod_r0Ap += sum * r0star[row];
+  }
+
+  ////////// parallel reduction in work group
+  __shared__ NumericT shared_array_ApAp[256];
+  __shared__ NumericT shared_array_pAp[256];
+  __shared__ NumericT shared_array_r0Ap[256];
+  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
+  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
+      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+    }
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0) {
+    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
+    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+  }
+}
+
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vec_size     = static_cast<unsigned int>(viennacl::traits::size(p));
+  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
+  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                      viennacl::cuda_arg<NumericT>(A.handle()),
+                                                      viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                      viennacl::cuda_arg<unsigned int>(A.handle4()),
+                                                      viennacl::cuda_arg<NumericT>(A.handle5()),
+                                                      static_cast<unsigned int>(A.internal_size1()),
+                                                      static_cast<unsigned int>(A.ell_nnz()),
+                                                      viennacl::cuda_arg(p),
+                                                      viennacl::cuda_arg(Ap),
+                                                      viennacl::cuda_arg(r0star),
+                                                      vec_size,
+                                                      viennacl::cuda_arg(inner_prod_buffer),
+                                                      chunk_size,
+                                                      chunk_offset);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
+}
+
+//////////////////////////////////////////
+
+template <typename T>
+__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
+                                                    unsigned int vk_offset,
+                                                    T const * residual,
+                                                    T * R_buffer,
+                                                    unsigned int R_offset,
+                                                    T const * inner_prod_buffer,
+                                                    unsigned int chunk_size,
+                                                    T * r_dot_vk_buffer,
+                                                    unsigned int chunk_offset,
+                                                    unsigned int size)
+{
+  __shared__ T shared_array[128];
+  T norm_vk = 0;
+
+  // parallel reduction in work group to compute <vk, vk>
+  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // compute alpha from reduced values:
+  __syncthreads();
+  norm_vk = sqrt(shared_array[0]);
+
+  T inner_prod_contrib = 0;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
+    T value_vk = vk[i + vk_offset] / norm_vk;
+
+    inner_prod_contrib += residual[i] * value_vk;
+
+    vk[i + vk_offset] = value_vk;
+  }
+  __syncthreads();
+
+  // parallel reduction in work group
+  shared_array[threadIdx.x] = inner_prod_contrib;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // write results of first reduction stage:
+  if (threadIdx.x == 0)
+    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
+  // store norm:
+  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
+    R_buffer[R_offset] = norm_vk;
+}
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+  *
+  * This routines computes for vectors 'r', 'v_k':
+  *   Second reduction step for ||v_k||
+  *   v_k /= ||v_k||
+  *   First reduction step for <r, v_k>
+  */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+                                  vector_base<T> const & residual,
+                                  vector_base<T> & R_buffer,
+                                  vcl_size_t offset_in_R,
+                                  vector_base<T> const & inner_prod_buffer,
+                                  vector_base<T> & r_dot_vk_buffer,
+                                  vcl_size_t buffer_chunk_size,
+                                  vcl_size_t buffer_chunk_offset)
+{
+  unsigned int vk_offset = viennacl::traits::start(v_k);
+  unsigned int R_offset = offset_in_R;
+  unsigned int chunk_size = buffer_chunk_size;
+  unsigned int chunk_offset = buffer_chunk_offset;
+  unsigned int size = v_k.size();
+
+  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(viennacl::cuda_arg(v_k),
+                                                    vk_offset,
+                                                    viennacl::cuda_arg(residual),
+                                                    viennacl::cuda_arg(R_buffer),
+                                                    R_offset,
+                                                    viennacl::cuda_arg(inner_prod_buffer),
+                                                    chunk_size,
+                                                    viennacl::cuda_arg(r_dot_vk_buffer),
+                                                    chunk_offset,
+                                                    size);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
+}
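+
+// Note: pipelined_gmres_normalize_vk() expects the per-block partial sums of <v_k, v_k> in
+// the second chunk of 'inner_prod_buffer' (offset buffer_chunk_size). It finishes that
+// reduction, stores ||v_k|| to R_buffer[offset_in_R], rescales v_k in place, and writes
+// per-block partial sums of <r, v_k> to r_dot_vk_buffer starting at buffer_chunk_offset
+// for a subsequent reduction stage.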
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
+                                                           unsigned int size,
+                                                           unsigned int internal_size,
+                                                           unsigned int k,
+                                                           T * vi_in_vk_buffer,
+                                                           unsigned int chunk_size)
+{
+  __shared__ T shared_array[7*128];
+  T value_vk = 0;
+
+  unsigned int k_base = 0;
+  while (k_base < k)
+  {
+    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
+
+    for (unsigned int j=0; j<vecs_in_iteration; ++j)
+      shared_array[threadIdx.x + j*chunk_size] = 0;
+
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+    {
+      value_vk = krylov_basis[i + k * internal_size];
+
+      for (unsigned int j=0; j<vecs_in_iteration; ++j)
+        shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
+    }
+
+    // parallel reduction in work group
+    for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+    {
+      __syncthreads();
+      if (threadIdx.x < stride) {
+        for (unsigned int j=0; j<vecs_in_iteration; ++j)
+          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
+      }
+    }
+
+    // write results to result array
+    if (threadIdx.x == 0)
+      for (unsigned int j=0; j<vecs_in_iteration; ++j)
+        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
+
+    k_base += vecs_in_iteration;
+  }
+
+}
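+// Layout note for stage 1 (added for clarity): the previous Krylov vectors v_0, ..., v_{k-1}
+// are processed in chunks of at most 7 vectors, since the shared buffer holds 7*128 partial
+// sums. After the kernel, vi_in_vk_buffer[blockIdx.x + i * chunk_size] contains the partial
+// sum of <v_i, v_k> computed by thread block blockIdx.x; stage 2 below sums these per-block
+// contributions to obtain the final inner products.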
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t param_k,
+                                         vector_base<T> & vi_in_vk_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  unsigned int chunk_size = buffer_chunk_size;
+  unsigned int size = v_k_size;
+  unsigned int internal_size = v_k_internal_size;
+  unsigned int k = param_k;
+
+  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
+                                                           size,
+                                                           internal_size,
+                                                           k,
+                                                           viennacl::cuda_arg(vi_in_vk_buffer),
+                                                           chunk_size);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
+}
+
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
+                                                           unsigned int size,
+                                                           unsigned int internal_size,
+                                                           unsigned int k,
+                                                           T const * vi_in_vk_buffer,
+                                                           unsigned int chunk_size,
+                                                           T * R_buffer,
+                                                           unsigned int krylov_dim,
+                                                           T * inner_prod_buffer)
+{
+  __shared__ T shared_array[7*128];
+  T vk_dot_vk = 0;
+  T value_vk = 0;
+
+  unsigned int k_base = 0;
+  while (k_base < k)
+  {
+    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
+
+    // parallel reduction in work group for <v_i, v_k>
+    for (unsigned int j=0; j<vecs_in_iteration; ++j)
+      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
+    for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+    {
+      __syncthreads();
+      if (threadIdx.x < stride) {
+        for (unsigned int j=0; j<vecs_in_iteration; ++j)
+          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
+      }
+    }
+    __syncthreads();
+
+    // v_k -= <v_i, v_k> v_i:
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+    {
+      value_vk = krylov_basis[i + k * internal_size];
+
+      for (unsigned int j=0; j<vecs_in_iteration; ++j)
+        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
+      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
+      krylov_basis[i + k * internal_size] = value_vk;
+    }
+
+    // write to R: (to avoid thread divergence, all threads write the same value)
+    if (blockIdx.x == 0)
+      for (unsigned int j=0; j<vecs_in_iteration; ++j)
+        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
+    __syncthreads();
+
+    k_base += vecs_in_iteration;
+  }
+
+  // parallel reduction in work group for <v_k, v_k>
+  shared_array[threadIdx.x] = vk_dot_vk;
+  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+  }
+
+  // write results to result array
+  if (threadIdx.x == 0)
+    inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t param_k,
+                                         vector_base<T> const & vi_in_vk_buffer,
+                                         vector_base<T> & R_buffer,
+                                         vcl_size_t krylov_dim,
+                                         vector_base<T> & inner_prod_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  unsigned int chunk_size = buffer_chunk_size;
+  unsigned int size = v_k_size;
+  unsigned int internal_size = v_k_internal_size;
+  unsigned int k = param_k;
+  unsigned int krylov = krylov_dim;
+
+  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
+                                                           size,
+                                                           internal_size,
+                                                           k,
+                                                           viennacl::cuda_arg(vi_in_vk_buffer),
+                                                           chunk_size,
+                                                           viennacl::cuda_arg(R_buffer),
+                                                           krylov,
+                                                           viennacl::cuda_arg(inner_prod_buffer));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
+}
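+// Usage sketch (hedged; the buffers are assumed to be sized by the pipelined GMRES driver
+// elsewhere in ViennaCL). The two stages are meant to be called back to back for the current
+// Krylov vector index k:
+//
+//   pipelined_gmres_gram_schmidt_stage1(krylov_basis, size, internal_size, k,
+//                                       vi_in_vk_buffer, chunk_size);
+//   pipelined_gmres_gram_schmidt_stage2(krylov_basis, size, internal_size, k,
+//                                       vi_in_vk_buffer, R_buffer, krylov_dim,
+//                                       inner_prod_buffer, chunk_size);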
+
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_update_result_kernel(T * result,
+                                                     T const * residual,
+                                                     T const * krylov_basis,
+                                                     unsigned int size,
+                                                     unsigned int internal_size,
+                                                     T const * coefficients,
+                                                     unsigned int k)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    T value_result = result[i] + coefficients[0] * residual[i];
+
+    for (unsigned int j = 1; j < k; ++j)
+      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
+
+    result[i] = value_result;
+  }
+}
+
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+                                   vector_base<T> const & residual,
+                                   vector_base<T> const & krylov_basis,
+                                   vcl_size_t v_k_size,
+                                   vcl_size_t v_k_internal_size,
+                                   vector_base<T> const & coefficients,
+                                   vcl_size_t param_k)
+{
+  unsigned int size = v_k_size;
+  unsigned int internal_size = v_k_internal_size;
+  unsigned int k = param_k;
+
+  pipelined_gmres_update_result_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
+                                                     viennacl::cuda_arg(residual),
+                                                     viennacl::cuda_arg(krylov_basis),
+                                                     size,
+                                                     internal_size,
+                                                     viennacl::cuda_arg(coefficients),
+                                                     k);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
+}
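+// In formula form (follows directly from the kernel above): with coefficients eta_0, ..., eta_{k-1}
+// the update computes
+//
+//   x  <-  x + eta_0 * r + sum_{j=1}^{k-1} eta_j * v_{j-1},
+//
+// where r is the initial residual and v_0, ..., v_{k-2} are the stored Krylov basis vectors.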
+
+
+
+template <typename NumericT>
+void pipelined_gmres_prod(compressed_matrix<NumericT> const & A,
+                          vector_base<NumericT> const & p,
+                          vector_base<NumericT> & Ap,
+                          vector_base<NumericT> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
+  {
+    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+  {
+    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                                        viennacl::cuda_arg<NumericT>(A.handle()),
+                                                                        viennacl::cuda_arg(p) + viennacl::traits::start(p),
+                                                                        viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+                                                                        size,
+                                                                        viennacl::cuda_arg(inner_prod_buffer),
+                                                                        buffer_size_per_vector
+                                                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
+  }
+  else
+  {
+    pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                           viennacl::cuda_arg<NumericT>(A.handle()),
+                                                           static_cast<unsigned int>(A.blocks1()),
+                                                           viennacl::cuda_arg(p) + viennacl::traits::start(p),
+                                                           viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+                                                           size,
+                                                           viennacl::cuda_arg(inner_prod_buffer),
+                                                           buffer_size_per_vector);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel");
+  }
+
+}
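+// Dispatch note (added for clarity): as in the pipelined CG code whose kernels are reused here,
+// the CSR product picks the blocked kernel (a fixed number of threads per row) when the average
+// number of nonzeros per row is large enough that few threads idle, and falls back to the
+// adaptive kernel otherwise; both variants also deposit the per-block partial inner products
+// needed later in the pipeline into inner_prod_buffer.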
+
+template <typename T>
+void pipelined_gmres_prod(coordinate_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  Ap.clear();
+
+  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+                                                viennacl::cuda_arg<T>(A.handle()),
+                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
+                                                viennacl::cuda_arg(p) + viennacl::traits::start(p),
+                                                viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+                                                size,
+                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
+}
+
+template <typename T>
+void pipelined_gmres_prod(ell_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                viennacl::cuda_arg<T>(A.handle()),
+                                                static_cast<unsigned int>(A.internal_size1()),
+                                                static_cast<unsigned int>(A.maxnnz()),
+                                                viennacl::cuda_arg(p) + viennacl::traits::start(p),
+                                                viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+                                                size,
+                                                viennacl::cuda_arg(inner_prod_buffer),
+                                                buffer_size_per_vector);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
+}
+
+template <typename T>
+void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  unsigned int size = p.size();
+  unsigned int buffe

<TRUNCATED>

[29/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
new file mode 100644
index 0000000..6ac8e09
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
@@ -0,0 +1,669 @@
+#ifndef VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
+#define VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/spgemm_rmerge.hpp
+    @brief Implementation of the sparse matrix-sparse matrix product (RMerge algorithm) for CSR matrices using CUDA
+*/
+
+#include <stdexcept>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Loads a value from the specified address. On CUDA arch 3.5 and above the load is routed through the read-only data cache (__ldg), so the value can be reused from cache */
+template<typename NumericT>
+static inline __device__ NumericT load_and_cache(const NumericT *address)
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg(address);
+#else
+  return *address;
+#endif
+}
+
+
+//
+// Stage 1: Obtain upper bound for number of elements per row in C:
+//
+template<typename IndexT>
+__device__ IndexT round_to_next_power_of_2(IndexT val)
+{
+  if (val > 32)
+    return 64; // just to indicate that we need to split/factor the matrix!
+  else if (val > 16)
+    return 32;
+  else if (val > 8)
+    return 16;
+  else if (val > 4)
+    return 8;
+  else if (val > 2)
+    return 4;
+  else if (val > 1)
+    return 2;
+  else
+    return 1;
+}
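+// Examples (follow directly from the branches above):
+//   round_to_next_power_of_2(1u)  returns 1
+//   round_to_next_power_of_2(5u)  returns 8
+//   round_to_next_power_of_2(17u) returns 32
+//   round_to_next_power_of_2(40u) returns 64, signalling that the row has to be split via the
+//                                 decomposition kernels further below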
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          IndexT *subwarpsize_per_group,
+          IndexT *max_nnz_row_A_per_group,
+          IndexT *max_nnz_row_B_per_group)
+{
+  unsigned int subwarpsize_in_thread = 0;
+  unsigned int max_nnz_row_A = 0;
+  unsigned int max_nnz_row_B = 0;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
+  {
+    unsigned int A_row_start = A_row_indices[row];
+    unsigned int A_row_end   = A_row_indices[row+1];
+    unsigned int row_num = A_row_end - A_row_start;
+    subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
+    max_nnz_row_A = max(max_nnz_row_A, row_num);
+    for (unsigned int j = A_row_start; j < A_row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
+      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
+    }
+  }
+
+  // reduction to obtain maximum in thread block
+  __shared__ unsigned int shared_subwarpsize[256];
+  __shared__ unsigned int shared_max_nnz_row_A[256];
+  __shared__ unsigned int shared_max_nnz_row_B[256];
+
+    shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread;
+  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
+  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+        shared_subwarpsize[threadIdx.x] = max(  shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
+      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
+      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
+    }
+  }
+
+  if (threadIdx.x == 0)
+  {
+      subwarpsize_per_group[blockIdx.x] = round_to_next_power_of_2(shared_subwarpsize[0]);
+    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
+    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
+  }
+}
+
+//
+// Stage 2: Determine sparsity pattern of C
+//
+
+// Using warp shuffle routines (requires CUDA compute capability 3.0 or higher)
+template<unsigned int SubWarpSizeV, typename IndexT>
+__device__ IndexT subwarp_minimum_shuffle(IndexT min_index)
+{
+  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+    min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+  return min_index;
+}
+
+// Using shared memory
+template<unsigned int SubWarpSizeV, typename IndexT>
+__device__ IndexT subwarp_minimum_shared(IndexT min_index, IndexT id_in_warp, IndexT *shared_buffer)
+{
+  shared_buffer[threadIdx.x] = min_index;
+  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+    shared_buffer[threadIdx.x] = min(shared_buffer[threadIdx.x], shared_buffer[(threadIdx.x + i) % 512]);
+  return shared_buffer[threadIdx.x - id_in_warp];
+}
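+// Worked example (added for clarity): for SubWarpSizeV = 4 and per-lane values {7, 3, 9, 5},
+// the first lane of the subwarp takes min(7, 9) in the first step and then the minimum with the
+// already-reduced neighbour value 3 in the second step, ending up with 3; the final read of
+// shared_buffer[threadIdx.x - id_in_warp] hands that minimum to every lane of the subwarp.
+// The shuffle variant above computes the same result without shared memory.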
+
+
+template<unsigned int SubWarpSizeV, typename IndexT>
+__global__ void compressed_matrix_gemm_stage_2(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          const IndexT * B_col_indices,
+          IndexT B_size2,
+          IndexT * C_row_indices)
+{
+  __shared__ unsigned int shared_buffer[512];
+
+  unsigned int num_warps  =  blockDim.x / SubWarpSizeV;
+  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
+  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+  {
+    unsigned int row_A_start = A_row_indices[row];
+    unsigned int row_A_end   = A_row_indices[row+1];
+
+    unsigned int my_row_B = row_A_start + id_in_warp;
+    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+    unsigned int num_nnz = 0;
+    if (row_A_end - row_A_start > 1) // rows of A with zero or one entry are handled by the faster path below
+    {
+      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+      while (1)
+      {
+        // determine current minimum (warp shuffle)
+        unsigned int min_index = current_front_index;
+        min_index = subwarp_minimum_shared<SubWarpSizeV>(min_index, id_in_warp, shared_buffer);
+
+        if (min_index == B_size2)
+          break;
+
+        // update front:
+        if (current_front_index == min_index)
+        {
+          ++row_B_start;
+          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+        }
+
+        ++num_nnz;
+      }
+    }
+    else
+    {
+      num_nnz = row_B_end - row_B_start;
+    }
+
+    if (id_in_warp == 0)
+      C_row_indices[row] = num_nnz;
+  }
+
+}
+
+
+//
+// Stage 3: Fill C with values
+//
+
+// Using warp shuffle routines (requires CUDA compute capability 3.0 or higher)
+template<unsigned int SubWarpSizeV, typename NumericT>
+__device__ NumericT subwarp_accumulate_shuffle(NumericT output_value)
+{
+  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+    output_value += __shfl_xor((int)output_value, (int)i);
+  return output_value;
+}
+
+// Using shared memory
+template<unsigned int SubWarpSizeV, typename NumericT>
+__device__ NumericT subwarp_accumulate_shared(NumericT output_value, unsigned int id_in_warp, NumericT *shared_buffer)
+{
+  shared_buffer[threadIdx.x] = output_value;
+  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+    shared_buffer[threadIdx.x] += shared_buffer[(threadIdx.x + i) % 512];
+  return shared_buffer[threadIdx.x - id_in_warp];
+}
+
+
+template<unsigned int SubWarpSizeV, typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_stage_3(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          const NumericT * A_elements,
+          IndexT A_size1,
+          const IndexT * B_row_indices,
+          const IndexT * B_col_indices,
+          const NumericT * B_elements,
+          IndexT B_size2,
+          IndexT const * C_row_indices,
+          IndexT * C_col_indices,
+          NumericT * C_elements)
+{
+  __shared__ unsigned int shared_indices[512];
+  __shared__ NumericT     shared_values[512];
+
+  unsigned int num_warps  =  blockDim.x / SubWarpSizeV;
+  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
+  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
+
+  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+  {
+    unsigned int row_A_start = A_row_indices[row];
+    unsigned int row_A_end   = A_row_indices[row+1];
+
+    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? id_in_warp : 0); // special case: single row
+    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
+    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+    NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+    unsigned int index_in_C = C_row_indices[row];
+
+    if (row_A_end - row_A_start > 1)
+    {
+      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+      NumericT     current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;
+
+      unsigned int index_buffer = 0;
+      NumericT     value_buffer = 0;
+      unsigned int buffer_size = 0;
+      while (1)
+      {
+        // determine current minimum:
+        unsigned int min_index = subwarp_minimum_shared<SubWarpSizeV>(current_front_index, id_in_warp, shared_indices);
+
+        if (min_index == B_size2) // done
+          break;
+
+        // compute entry in C:
+        NumericT output_value = (current_front_index == min_index) ? val_A * current_front_value : 0;
+        output_value = subwarp_accumulate_shared<SubWarpSizeV>(output_value, id_in_warp, shared_values);
+
+        // update front:
+        if (current_front_index == min_index)
+        {
+          ++row_B_start;
+          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+          current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;
+        }
+
+        // write current front to register buffer:
+        index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
+        value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
+        ++buffer_size;
+
+        // flush register buffer via a coalesced write once full:
+        if (buffer_size == SubWarpSizeV)
+        {
+          C_col_indices[index_in_C + id_in_warp] = index_buffer;
+          C_elements[index_in_C + id_in_warp]    = value_buffer;
+        }
+
+        index_in_C += (buffer_size == SubWarpSizeV) ? SubWarpSizeV : 0;
+        buffer_size = (buffer_size == SubWarpSizeV) ?           0  : buffer_size;
+      }
+
+      // write remaining entries in register buffer to C:
+      if (id_in_warp < buffer_size)
+      {
+        C_col_indices[index_in_C + id_in_warp] = index_buffer;
+        C_elements[index_in_C + id_in_warp]  = value_buffer;
+      }
+    }
+    else // write respective row using the full subwarp:
+    {
+      for (unsigned int i = row_B_start + id_in_warp; i < row_B_end; i += SubWarpSizeV)
+      {
+        C_col_indices[index_in_C + id_in_warp] = load_and_cache(B_col_indices + i);
+        C_elements[index_in_C + id_in_warp]    = val_A * load_and_cache(B_elements    + i);
+        index_in_C += SubWarpSizeV;
+      }
+    }
+
+  }
+
+}
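+// Note on the write pattern in stage 3 (added for clarity): each merge step produces at most
+// one (column index, value) pair per subwarp. Instead of writing it immediately, the pair is
+// parked in the registers of the lane whose id equals the current buffer fill level; once
+// SubWarpSizeV pairs have accumulated, all lanes flush their parked pairs in a single coalesced
+// write to C, and any remainder is written after the merge loop terminates.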
+
+
+
+//
+// Decomposition kernels:
+//
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_decompose_1(
+          const IndexT * A_row_indices,
+          IndexT A_size1,
+          IndexT max_per_row,
+          IndexT *chunks_per_row)
+{
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+  {
+    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
+    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1);
+  }
+}
+
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_A2(
+          IndexT * A2_row_indices,
+          IndexT * A2_col_indices,
+          NumericT * A2_elements,
+          IndexT A2_size1,
+          IndexT *new_row_buffer)
+{
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
+  {
+    unsigned int index_start = new_row_buffer[i];
+    unsigned int index_stop  = new_row_buffer[i+1];
+
+    A2_row_indices[i] = index_start;
+
+    for (IndexT j = index_start; j < index_stop; ++j)
+    {
+      A2_col_indices[j] = j;
+      A2_elements[j] = NumericT(1);
+    }
+  }
+
+  // write last entry in row_buffer with global thread 0:
+  if (threadIdx.x == 0 && blockIdx.x == 0)
+    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
+}
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_G1(
+          IndexT * G1_row_indices,
+          IndexT * G1_col_indices,
+          NumericT * G1_elements,
+          IndexT G1_size1,
+          IndexT const *A_row_indices,
+          IndexT const *A_col_indices,
+          NumericT const *A_elements,
+          IndexT A_size1,
+          IndexT A_nnz,
+          IndexT max_per_row,
+          IndexT *new_row_buffer)
+{
+  // Part 1: Copy column indices and entries:
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
+  {
+    G1_col_indices[i] = A_col_indices[i];
+    G1_elements[i]    = A_elements[i];
+  }
+
+  // Part 2: Derive new row indices:
+  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+  {
+    unsigned int old_start = A_row_indices[i];
+    unsigned int new_start = new_row_buffer[i];
+    unsigned int row_chunks = new_row_buffer[i+1] - new_start;
+
+    for (IndexT j=0; j<row_chunks; ++j)
+      G1_row_indices[new_start + j] = old_start + j * max_per_row;
+  }
+
+  // write last entry in row_buffer with global thread 0:
+  if (threadIdx.x == 0 && blockIdx.x == 0)
+    G1_row_indices[G1_size1] = A_row_indices[A_size1];
+}
+
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A     Left factor
+* @param B     Right factor
+* @param C     Result matrix
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+  C.resize(A.size1(), B.size2(), false);
+
+  unsigned int blocknum = 256;
+  unsigned int threadnum = 128;
+
+  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size to be used by each work group (a power of two derived from the maximum nonzeros per row of A in that group)
+  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+
+  //
+  // Stage 1: Determine upper bound for number of nonzeros
+  //
+  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                          static_cast<unsigned int>(A.size1()),
+                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                          viennacl::cuda_arg(subwarp_sizes),
+                                                          viennacl::cuda_arg(max_nnz_row_A),
+                                                          viennacl::cuda_arg(max_nnz_row_B)
+                                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
+
+  subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());
+
+  max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());
+
+  max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
+  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());
+
+  unsigned int max_subwarp_size = 0;
+  //std::cout << "Scratchpad offsets: " << std::endl;
+  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
+    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
+  unsigned int A_max_nnz_per_row = 0;
+  for (std::size_t i=0; i<max_nnz_row_A.size(); ++i)
+    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
+
+  if (max_subwarp_size > 32)
+  {
+    // determine augmented size:
+    unsigned int max_entries_in_G = 32;
+    if (A_max_nnz_per_row <= 256)
+      max_entries_in_G = 16;
+    if (A_max_nnz_per_row <= 64)
+      max_entries_in_G = 8;
+
+    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                                static_cast<unsigned int>(A.size1()),
+                                                                static_cast<unsigned int>(max_entries_in_G),
+                                                                viennacl::cuda_arg(exclusive_scan_helper)
+                                                               );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");
+
+    viennacl::linalg::exclusive_scan(exclusive_scan_helper);
+    unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+    // split A = A2 * G1
+    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(),        A.nnz(), viennacl::traits::context(A));
+
+    // fill A2:
+    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
+                                                       static_cast<unsigned int>(A2.size1()),
+                                                       viennacl::cuda_arg(exclusive_scan_helper)
+                                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");
+
+    // fill G1:
+    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
+                                                       static_cast<unsigned int>(G1.size1()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                       viennacl::cuda_arg<NumericT>(A.handle()),
+                                                       static_cast<unsigned int>(A.size1()),
+                                                       static_cast<unsigned int>(A.nnz()),
+                                                       static_cast<unsigned int>(max_entries_in_G),
+                                                       viennacl::cuda_arg(exclusive_scan_helper)
+                                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");
+
+    // compute tmp = G1 * B;
+    // C = A2 * tmp;
+    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+    prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+    prod_impl(A2, tmp, C); // this may split A2 again
+    return;
+  }
+
+  //std::cout << "Running RMerge with subwarp size " << max_subwarp_size << std::endl;
+
+  subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
+  max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
+  max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
+
+  //
+  // Stage 2: Determine pattern of C
+  //
+
+  if (max_subwarp_size == 32)
+  {
+    compressed_matrix_gemm_stage_2<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                           static_cast<unsigned int>(A.size1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                           static_cast<unsigned int>(B.size2()),
+                                                           viennacl::cuda_arg<unsigned int>(C.handle1())
+                                                          );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+  }
+  else if (max_subwarp_size == 16)
+  {
+    compressed_matrix_gemm_stage_2<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                           static_cast<unsigned int>(A.size1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                           static_cast<unsigned int>(B.size2()),
+                                                           viennacl::cuda_arg<unsigned int>(C.handle1())
+                                                          );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+  }
+  else
+  {
+    compressed_matrix_gemm_stage_2<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                           static_cast<unsigned int>(A.size1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                           viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                           static_cast<unsigned int>(B.size2()),
+                                                           viennacl::cuda_arg<unsigned int>(C.handle1())
+                                                          );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+  }
+
+  // exclusive scan on C.handle1(), ultimately allowing to allocate remaining memory for C
+  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+  unsigned int current_offset = 0;
+  for (std::size_t i=0; i<C.size1(); ++i)
+  {
+    unsigned int tmp = row_buffer[i];
+    row_buffer.set(i, current_offset);
+    current_offset += tmp;
+  }
+  row_buffer.set(C.size1(), current_offset);
+  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+
+
+  //
+  // Stage 3: Compute entries in C
+  //
+  C.reserve(current_offset, false);
+
+  if (max_subwarp_size == 32)
+  {
+    compressed_matrix_gemm_stage_3<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(A.handle()),
+                                                            static_cast<unsigned int>(A.size1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(B.handle()),
+                                                            static_cast<unsigned int>(B.size2()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(C.handle())
+                                                           );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+  }
+  else if (max_subwarp_size == 16)
+  {
+    compressed_matrix_gemm_stage_3<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(A.handle()),
+                                                            static_cast<unsigned int>(A.size1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(B.handle()),
+                                                            static_cast<unsigned int>(B.size2()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(C.handle())
+                                                           );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+  }
+  else
+  {
+    compressed_matrix_gemm_stage_3<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(A.handle()),
+                                                            static_cast<unsigned int>(A.size1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(B.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(B.handle()),
+                                                            static_cast<unsigned int>(B.size2()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(C.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(C.handle())
+                                                           );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+  }
+
+}
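+// Usage sketch (hedged, assuming a CUDA-enabled ViennaCL build; the matrices live in a CUDA
+// memory context and are filled elsewhere):
+//
+//   viennacl::compressed_matrix<double> A(n, n), B(n, n), C;
+//   // ... fill A and B ...
+//   C = viennacl::linalg::prod(A, B);   // dispatches to the prod_impl() above for CUDA contexts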
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


[15/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
new file mode 100644
index 0000000..c9dec88
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
@@ -0,0 +1,1303 @@
+#ifndef VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations including matrix-vector products.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/host_based/matrix_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/matrix_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/matrix_operations.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    template<typename DestNumericT, typename SrcNumericT>
+    void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const & src)
+    {
+      assert(viennacl::traits::size1(dest) == viennacl::traits::size1(src) && bool("Incompatible matrix sizes in m1 = m2 (convert): size1(m1) != size1(m2)"));
+      assert(viennacl::traits::size2(dest) == viennacl::traits::size2(src) && bool("Incompatible matrix sizes in m1 = m2 (convert): size2(m1) != size2(m2)"));
+
+      switch (viennacl::traits::handle(dest).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::convert(dest, src);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::convert(dest, src);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::convert(dest, src);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
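+    // Note (added for clarity): every routine in this file follows the dispatch pattern shown
+    // above in convert() -- inspect the active memory handle of the operands and forward to the
+    // host_based, opencl, or cuda backend, throwing a memory_exception for uninitialised or
+    // unknown memory domains.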
+
+    template<typename NumericT,
+              typename SizeT, typename DistanceT>
+    void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
+              matrix_base<NumericT> & temp_trans)
+    {
+      switch (viennacl::traits::handle(proxy).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::trans(proxy, temp_trans);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::trans(proxy,temp_trans);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::trans(proxy,temp_trans);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    template<typename NumericT,
+              typename ScalarType1>
+    void am(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+    {
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    template<typename NumericT,
+              typename ScalarType1, typename ScalarType2>
+    void ambm(matrix_base<NumericT> & mat1,
+              matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+              matrix_base<NumericT> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::ambm(mat1,
+                                             mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                             mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::ambm(mat1,
+                                         mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::ambm(mat1,
+                                       mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                       mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    template<typename NumericT,
+              typename ScalarType1, typename ScalarType2>
+    void ambm_m(matrix_base<NumericT> & mat1,
+                matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                matrix_base<NumericT> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::ambm_m(mat1,
+                                               mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                               mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::ambm_m(mat1,
+                                           mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                           mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::ambm_m(mat1,
+                                         mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    template<typename NumericT>
+    void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+    {
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_assign(mat, s, clear);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_assign(mat, s, clear);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_assign(mat, s, clear);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    template<typename NumericT>
+    void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+    {
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diagonal_assign(mat, s);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diagonal_assign(mat, s);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diagonal_assign(mat, s);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    /** @brief Dispatcher interface for A = diag(v, k) */
+    template<typename NumericT>
+    void matrix_diag_from_vector(const vector_base<NumericT> & v, int k, matrix_base<NumericT> & A)
+    {
+      switch (viennacl::traits::handle(v).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diag_from_vector(v, k, A);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diag_from_vector(v, k, A);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diag_from_vector(v, k, A);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Dispatcher interface for v = diag(A, k) */
+    template<typename NumericT>
+    void matrix_diag_to_vector(const matrix_base<NumericT> & A, int k, vector_base<NumericT> & v)
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diag_to_vector(A, k, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diag_to_vector(A, k, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diag_to_vector(A, k, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    template<typename NumericT>
+    void matrix_row(const matrix_base<NumericT> & A, unsigned int i, vector_base<NumericT> & v)
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_row(A, i, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_row(A, i, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_row(A, i, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    template<typename NumericT>
+    void matrix_column(const matrix_base<NumericT> & A, unsigned int j, vector_base<NumericT> & v)
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_column(A, j, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_column(A, j, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_column(A, j, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Computes the Frobenius norm of a matrix - dispatcher interface
+    *
+    * @param A      The matrix
+    * @param result The result scalar
+    *
+    * Note that if A is strided or offset, then a temporary copy is created.
+    */
+    template<typename T>
+    void norm_frobenius_impl(matrix_base<T> const & A,
+                             scalar<T> & result)
+    {
+      typedef typename matrix_base<T>::handle_type  HandleType;
+
+      if ((A.start1() > 0) || (A.start2() > 0) || (A.stride1() > 1) || (A.stride2() > 1)) {
+        if (A.row_major()) {
+          viennacl::matrix<T, viennacl::row_major> temp_A(A);
+          viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+          norm_2_impl(temp, result);
+        } else {
+          viennacl::matrix<T, viennacl::column_major> temp_A(A);
+          viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+          norm_2_impl(temp, result);
+        }
+      } else {
+        viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+        norm_2_impl(temp, result);
+      }
+
+    }
+
+    /** @brief Computes the Frobenius norm of a matrix, with the final reduction performed on the CPU
+    *
+    * @param A      The matrix
+    * @param result The result scalar
+    *
+    * Note that if A is strided or offset, then a temporary copy is created.
+    */
+    template<typename T>
+    void norm_frobenius_cpu(matrix_base<T> const & A,
+                            T & result)
+    {
+      typedef typename matrix_base<T>::handle_type  HandleType;
+
+      if ((A.start1() > 0) || (A.start2() > 0) || (A.stride1() > 1) || (A.stride2() > 1)) {
+        if (A.row_major()) {
+          viennacl::matrix<T, viennacl::row_major> temp_A(A);
+          viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+          norm_2_cpu(temp, result);
+        } else {
+          viennacl::matrix<T, viennacl::column_major> temp_A(A);
+          viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+          norm_2_cpu(temp, result);
+        }
+      } else {
+        viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+        norm_2_cpu(temp, result);
+      }
+
+    }
+
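+    // Usage sketch (illustrative, added for clarity): the two dispatchers above can be
+    // called directly when the Frobenius norm is needed on the host, e.g.
+    //
+    //   viennacl::matrix<double> A(64, 64);
+    //   double nf = 0;
+    //   viennacl::linalg::norm_frobenius_cpu(A, nf);   // final reduction on the CPU
+    //
+    // As noted above, a strided or offset matrix triggers a temporary copy.
+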
+    //
+    /////////////////////////   matrix-vector products /////////////////////////////////
+    //
+
+
+
+    // A * x
+
+    /** @brief Carries out matrix-vector multiplication
+    *
+    * Implementation of the convenience expression result = prod(mat, vec);
+    *
+    * @param mat    The matrix
+    * @param vec    The vector
+    * @param result The result vector
+    */
+    template<typename NumericT>
+    void prod_impl(const matrix_base<NumericT> & mat,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result)
+    {
+      assert( (viennacl::traits::size1(mat) == viennacl::traits::size(result)) && bool("Size check failed at v1 = prod(A, v2): size1(A) != size(v1)"));
+      assert( (viennacl::traits::size2(mat) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = prod(A, v2): size2(A) != size(v2)"));
+
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(mat, false, vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat, false, vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(mat, false, vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    // trans(A) * x
+
+    /** @brief Carries out matrix-vector multiplication with a transposed matrix
+    *
+    * Implementation of the convenience expression result = trans(mat) * vec;
+    *
+    * @param mat_trans  The transposed matrix proxy
+    * @param vec        The vector
+    * @param result     The result vector
+    */
+    template<typename NumericT>
+    void prod_impl(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & mat_trans,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result)
+    {
+      assert( (viennacl::traits::size1(mat_trans.lhs()) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = trans(A) * v2: size1(A) != size(v2)"));
+      assert( (viennacl::traits::size2(mat_trans.lhs()) == viennacl::traits::size(result)) && bool("Size check failed at v1 = trans(A) * v2: size2(A) != size(v1)"));
+
+      switch (viennacl::traits::handle(mat_trans.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(mat_trans.lhs(), true, vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat_trans.lhs(), true, vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(mat_trans.lhs(), true, vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    //
+    /////////////////////////   matrix-matrix products /////////////////////////////////
+    //
+
+    /** @brief Carries out matrix-matrix multiplication
+    *
+    * Implementation of C = prod(A, B);
+    *
+    */
+    template<typename NumericT, typename ScalarType >
+    void prod_impl(const matrix_base<NumericT> & A,
+                   const matrix_base<NumericT> & B,
+                         matrix_base<NumericT> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
+    {
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size check failed at C = prod(A, B): size1(A) != size1(C)"));
+      assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size check failed at C = prod(A, B): size2(A) != size1(B)"));
+      assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size check failed at C = prod(A, B): size2(B) != size2(C)"));
+
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, false, B, false, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, false, B, false, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, false, B, false, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+
+    /** @brief Carries out matrix-matrix multiplication
+    *
+    * Implementation of C = prod(trans(A), B);
+    *
+    */
+    template<typename NumericT, typename ScalarType >
+    void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT>,
+                                                      const matrix_base<NumericT>,
+                                                      op_trans> & A,
+                   const matrix_base<NumericT> & B,
+                         matrix_base<NumericT> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
+    {
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(trans(A), B): size2(A) != size1(C)"));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B) && bool("Size check failed at C = prod(trans(A), B): size1(A) != size1(B)"));
+      assert(viennacl::traits::size2(B)       == viennacl::traits::size2(C) && bool("Size check failed at C = prod(trans(A), B): size2(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+
+
+    /** @brief Carries out matrix-matrix multiplication
+    *
+    * Implementation of C = prod(A, trans(B));
+    *
+    */
+    template<typename NumericT, typename ScalarType >
+    void prod_impl(const matrix_base<NumericT> & A,
+                   const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & B,
+                         matrix_base<NumericT> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
+    {
+      assert(viennacl::traits::size1(A)       == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(A) != size1(C)"));
+      assert(viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(A, trans(B)): size2(A) != size2(B)"));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+
+    /** @brief Carries out matrix-matrix multiplication
+    *
+    * Implementation of C = prod(trans(A), trans(B));
+    *
+    */
+    template<typename NumericT, typename ScalarType >
+    void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & A,
+                   const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & B,
+                   matrix_base<NumericT> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
+    {
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
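+    // Usage sketch (illustrative; assumes viennacl/linalg/prod.hpp is included for the
+    // high-level wrapper): these prod_impl() overloads back the convenience expression
+    // viennacl::linalg::prod(), e.g.
+    //
+    //   viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);
+    //   viennacl::vector<float> x(64), y(64);
+    //   y = viennacl::linalg::prod(A, x);                   // matrix-vector product
+    //   C = viennacl::linalg::prod(A, B);                   // matrix-matrix product
+    //   C = viennacl::linalg::prod(viennacl::trans(A), B);  // transposed variant
+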
+    ///////////////////////// summation operations /////////////
+
+    template<typename NumericT>
+    void row_sum_impl(matrix_base<NumericT> const & A, vector_base<NumericT> & result)
+    {
+      viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(A.size2(), NumericT(1), viennacl::traits::context(A));
+      viennacl::linalg::prod_impl(A, all_ones, result);
+    }
+
+    template<typename NumericT>
+    void column_sum_impl(matrix_base<NumericT> const & A, vector_base<NumericT> & result)
+    {
+      viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(A.size1(), NumericT(1), viennacl::traits::context(A));
+      viennacl::linalg::prod_impl(matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>(A, A), all_ones, result);
+    }
+
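+    // Minimal sketch (illustrative): row sums are obtained by multiplying with a vector of
+    // ones, which is exactly what the helper above does:
+    //
+    //   viennacl::matrix<double> A(64, 64);
+    //   viennacl::vector<double> sums(64);
+    //   viennacl::linalg::row_sum_impl(A, sums);     // sums[i] = sum_j A(i, j)
+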
+    ///////////////////////// Elementwise operations /////////////
+
+
+
+    /** @brief Implementation of the element-wise operation A = B .* C and A = B ./ C for matrices (using MATLAB syntax). Don't use this function directly, use element_prod() and element_div().
+    *
+    * @param A      The result matrix (or -range, or -slice)
+    * @param proxy  The proxy object holding B, C, and the operation
+    */
+    template<typename T, typename OP>
+    void element_op(matrix_base<T> & A,
+                    matrix_expression<const matrix_base<T>, const matrix_base<T>, OP> const & proxy)
+    {
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(proxy)) && bool("Size check failed at A = element_op(B): size1(A) != size1(B)"));
+      assert( (viennacl::traits::size2(A) == viennacl::traits::size2(proxy)) && bool("Size check failed at A = element_op(B): size2(A) != size2(B)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::element_op(A, proxy);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::element_op(A, proxy);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::element_op(A, proxy);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
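+    // Minimal sketch (illustrative): element_op() is the backend behind the
+    // element_prod()/element_div()/element_pow() wrappers generated by the macro below:
+    //
+    //   viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);
+    //   A = viennacl::linalg::element_prod(B, C);    // A(i,j) = B(i,j) * C(i,j)
+    //   A = viennacl::linalg::element_div(B, C);     // A(i,j) = B(i,j) / C(i,j)
+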
+#define VIENNACL_MAKE_BINARY_OP(OPNAME)\
+    template<typename T>\
+    viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_base<T> const & A, matrix_base<T> const & B)\
+    {\
+      return viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<op_##OPNAME> >(A, B);\
+    }\
+\
+    template<typename M1, typename M2, typename OP, typename T>\
+    viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+                                const matrix_base<T>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_expression<const M1, const M2, OP> const & proxy, matrix_base<T> const & B)\
+    {\
+      return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+                                         const matrix_base<T>,\
+                                         op_element_binary<op_##OPNAME> >(proxy, B);\
+    }\
+\
+    template<typename T, typename M2, typename M3, typename OP>\
+    viennacl::matrix_expression<const matrix_base<T>,\
+                                const matrix_expression<const M2, const M3, OP>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_base<T> const & A, matrix_expression<const M2, const M3, OP> const & proxy)\
+    {\
+      return viennacl::matrix_expression<const matrix_base<T>,\
+                                         const matrix_expression<const M2, const M3, OP>,\
+                                         op_element_binary<op_##OPNAME> >(A, proxy);\
+    }\
+\
+    template<typename M1, typename M2, typename OP1,\
+              typename M3, typename M4, typename OP2>\
+    viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+                                const matrix_expression<const M3, const M4, OP2>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_expression<const M1, const M2, OP1> const & proxy1,\
+                 matrix_expression<const M3, const M4, OP2> const & proxy2)\
+    {\
+      return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+                                         const matrix_expression<const M3, const M4, OP2>,\
+                                         op_element_binary<op_##OPNAME> >(proxy1, proxy2);\
+    }
+
+    VIENNACL_MAKE_BINARY_OP(prod)
+    VIENNACL_MAKE_BINARY_OP(div)
+    VIENNACL_MAKE_BINARY_OP(pow)
+
+    VIENNACL_MAKE_BINARY_OP(eq)
+    VIENNACL_MAKE_BINARY_OP(neq)
+    VIENNACL_MAKE_BINARY_OP(greater)
+    VIENNACL_MAKE_BINARY_OP(less)
+    VIENNACL_MAKE_BINARY_OP(geq)
+    VIENNACL_MAKE_BINARY_OP(leq)
+
+#undef VIENNACL_MAKE_BINARY_OP
+
+
+
+#define VIENNACL_MAKE_UNARY_ELEMENT_OP(funcname) \
+    template<typename T> \
+    viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<op_##funcname> > \
+    element_##funcname(matrix_base<T> const & A) \
+    { \
+      return viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<op_##funcname> >(A, A); \
+    } \
+    template<typename LHS, typename RHS, typename OP> \
+    viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+                                const matrix_expression<const LHS, const RHS, OP>, \
+                                op_element_unary<op_##funcname> > \
+    element_##funcname(matrix_expression<const LHS, const RHS, OP> const & proxy) \
+    { \
+      return viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+                                         const matrix_expression<const LHS, const RHS, OP>, \
+                                         op_element_unary<op_##funcname> >(proxy, proxy); \
+    } \
+
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(abs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(acos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(asin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(atan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(ceil)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cosh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(exp)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(fabs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(floor)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log10)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sinh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sqrt)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tanh)
+
+#undef VIENNACL_MAKE_UNARY_ELEMENT_OP
+
+
+    //
+    /////////////////////////   miscellaneous operations /////////////////////////////////
+    //
+
+
+    /** @brief Returns a proxy class for the operation mat += vec1 * vec2^T, i.e. a rank 1 update
+    *
+    * @param vec1    The first vector
+    * @param vec2    The second vector
+    */
+    template<typename NumericT>
+    viennacl::matrix_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_prod>
+    outer_prod(const vector_base<NumericT> & vec1, const vector_base<NumericT> & vec2)
+    {
+      return viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>(vec1, vec2);
+    }
+
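+    // Illustrative sketch: outer_prod() only builds an expression proxy; the actual work is
+    // done by the scaled_rank_1_update() dispatcher below, as documented there:
+    //
+    //   viennacl::matrix<double> A(64, 64);
+    //   viennacl::vector<double> u(64), v(64);
+    //   A += viennacl::linalg::outer_prod(u, v);          // rank-1 update A += u * v^T
+    //   A += 2.0 * viennacl::linalg::outer_prod(u, v);    // scaled rank-1 update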
+
+    /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+    *
+    * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+    *
+    * @param mat1             The matrix to be updated
+    * @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+    * @param len_alpha        Length of the buffer for an eventual final reduction step (currently always '1')
+    * @param reciprocal_alpha Use 1/alpha instead of alpha
+    * @param flip_sign_alpha  Use -alpha instead of alpha
+    * @param vec1             The first vector
+    * @param vec2             The second vector
+    */
+    template<typename NumericT, typename S1>
+    void scaled_rank_1_update(matrix_base<NumericT> & mat1,
+                              S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                              const vector_base<NumericT> & vec1,
+                              const vector_base<NumericT> & vec2)
+    {
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::scaled_rank_1_update(mat1,
+                                                             alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                             vec1, vec2);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::scaled_rank_1_update(mat1,
+                                                         alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                         vec1, vec2);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::scaled_rank_1_update(mat1,
+                                                       alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                       vec1, vec2);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief This function stores the diagonal and the superdiagonal of a matrix in two vectors.
+    *
+    *
+    * @param A     The matrix from which the diagonal and superdiagonal are extracted.
+    * @param dh    The vector in which the diagonal of the matrix is stored.
+    * @param sh    The vector in which the superdiagonal of the matrix is stored.
+    */
+
+    template <typename NumericT, typename VectorType>
+    void bidiag_pack(matrix_base<NumericT> & A,
+                     VectorType & dh,
+                     VectorType & sh
+                    )
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::bidiag_pack(A, dh, sh);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::bidiag_pack(A, dh, sh);
+          break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::bidiag_pack(A, dh, sh);
+          break;
+#endif
+
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+
+
+    }
+    /** @brief This function copies a row or a column from a matrix to a vector.
+    *
+    *
+    * @param A          The matrix to copy from.
+    * @param V          The vector to be filled with the copied data.
+    * @param row_start  The index of the first row to copy.
+    * @param col_start  The index of the first column to copy.
+    * @param copy_col   Set to TRUE to copy a column, FALSE to copy a row.
+    */
+
+    template <typename SCALARTYPE>
+    void copy_vec(matrix_base<SCALARTYPE>& A,
+                  vector_base<SCALARTYPE>& V,
+                  vcl_size_t row_start,
+                  vcl_size_t col_start,
+                  bool copy_col
+    )
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::copy_vec(A, V, row_start, col_start, copy_col);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::copy_vec(A, V, row_start, col_start, copy_col);
+          break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::copy_vec(A, V, row_start, col_start, copy_col);
+          break;
+#endif
+
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+
+    }
+
+    /** @brief This function applies a Householder transformation to a matrix: A <- P * A with a Householder reflection P
+    *
+    * @param A       The matrix to be updated.
+    * @param D       The normalized Householder vector.
+    * @param start   The repetition counter.
+    */
+  template <typename NumericT>
+  void house_update_A_left(matrix_base<NumericT> & A,
+                           vector_base<NumericT>    & D,
+                           vcl_size_t start)
+  {
+    switch (viennacl::traits::handle(A).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::house_update_A_left(A, D, start);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::house_update_A_left(A, D, start);
+        break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::house_update_A_left(A, D, start);
+        break;
+#endif
+
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  /** @brief This function applies a Householder transformation to a matrix: A <- A * P with a Householder reflection P
+  *
+  *
+  * @param A        The matrix to be updated.
+  * @param D        The normalized Householder vector.
+  */
+
+  template <typename NumericT>
+  void house_update_A_right(matrix_base<NumericT>& A,
+                            vector_base<NumericT>   & D)
+  {
+    switch (viennacl::traits::handle(A).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::house_update_A_right(A, D);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::house_update_A_right(A, D);
+        break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::house_update_A_right(A, D);
+        break;
+#endif
+
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+  /** @brief This function updates the matrix Q, which is needed for the computation of the eigenvectors.
+  *
+  * @param Q        The matrix to be updated.
+  * @param D        The Householder vector.
+  * @param A_size1  size1 of matrix A
+  */
+
+  template <typename NumericT>
+  void house_update_QL(matrix_base<NumericT> & Q,
+                       vector_base<NumericT>    & D,
+                       vcl_size_t A_size1)
+  {
+    switch (viennacl::traits::handle(Q).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::house_update_QL(Q, D, A_size1);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::house_update_QL(Q, D, A_size1);
+        break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::house_update_QL(Q, D, A_size1);
+        break;
+#endif
+
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  /** @brief This function updates the matrix Q. It is part of the tql2 algorithm.
+  *
+  *
+  * @param Q       The matrix to be updated.
+  * @param tmp1    Vector with data from the tql2 algorithm.
+  * @param tmp2    Vector with data from the tql2 algorithm.
+  * @param l       Data from the tql2 algorithm.
+  * @param m       Data from the tql2 algorithm.
+  */
+  template<typename NumericT>
+  void givens_next(matrix_base<NumericT> & Q,
+                   vector_base<NumericT> & tmp1,
+                   vector_base<NumericT> & tmp2,
+                   int l,
+                   int m
+                )
+  {
+    switch (viennacl::traits::handle(Q).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::givens_next(Q, tmp1, tmp2, l, m);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::givens_next(Q, tmp1, tmp2, l, m);
+        break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::givens_next(Q, tmp1, tmp2, l, m);
+        break;
+#endif
+
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+  } //namespace linalg
+
+
+
+
+  //
+  /////////////////////////  Operator overloads /////////////////////////////////
+  //
+
+
+  //v += A * x
+  /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
+  *
+  * @param v1     The result vector, to which A * v2 is added
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator+=(vector_base<NumericT> & v1,
+             const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
+  {
+    assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 += A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    v1 += result;
+    return v1;
+  }
+
+  /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
+  *
+  * @param v1     The result vector, from which A * v2 is subtracted
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator-=(vector_base<NumericT> & v1,
+             const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
+  {
+    assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 -= A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    v1 -= result;
+    return v1;
+  }
+
+
+
+
+
+  //free functions:
+  /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
+  *
+  * @param v1     The addend vector.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  viennacl::vector<NumericT>
+  operator+(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+  {
+    assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 + A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result += v1;
+    return result;
+  }
+
+  /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
+  *
+  * @param v1     The vector from which A * v2 is subtracted.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  viennacl::vector<NumericT>
+  operator-(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+  {
+    assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 - A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result = v1 - result;
+    return result;
+  }
+
+
+  ////////// transposed_matrix_proxy
+
+
+  //v += A^T * x
+  /** @brief Implementation of the operation v1 += trans(A) * v2, where A is a matrix
+  *
+  * @param v1     The vector to which trans(A) * v2 is added.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator+=(vector_base<NumericT> & v1,
+             const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+                                                              const vector_base<NumericT>,
+                                                              op_prod> & proxy)
+  {
+    assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 += trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    v1 += result;
+    return v1;
+  }
+
+  //v -= A^T * x
+  /** @brief Implementation of the operation v1 -= trans(A) * v2, where A is a matrix
+  *
+  * @param v1     The vector from which trans(A) * v2 is subtracted.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator-=(vector_base<NumericT> & v1,
+             const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+                                                              const vector_base<NumericT>,
+                                                              op_prod> & proxy)
+  {
+    assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 += trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    v1 -= result;
+    return v1;
+  }
+
+
+  //free functions:
+  /** @brief Implementation of the operation 'result = v1 + trans(A) * v2', where A is a matrix
+  *
+  * @param v1     The addend vector.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator+(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+                                     const vector_base<NumericT>,
+                                     op_prod> & proxy)
+  {
+    assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 + trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result += v1;
+    return result;
+  }
+
+  /** @brief Implementation of the operation 'result = v1 - trans(A) * v2', where A is a matrix
+  *
+  * @param v1     The addend vector.
+  * @param proxy  An expression template proxy class.
+  */
+  template<typename NumericT>
+  vector<NumericT>
+  operator-(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+                                     const vector_base<NumericT>,
+                                     op_prod> & proxy)
+  {
+    assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 - trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result = v1 - result;
+    return result;
+  }
+
+
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
new file mode 100644
index 0000000..9269598
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
@@ -0,0 +1,152 @@
+#ifndef VIENNACL_LINALG_MAXMIN_HPP_
+#define VIENNACL_LINALG_MAXMIN_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/maxmin.hpp
+    @brief Generic interface for the vector max() and min() reductions. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic max() and min() functions
+  //   use tag dispatch to identify which implementation
+  //   should be called
+  //
+  namespace linalg
+  {
+
+
+    // ----------------------------------------------------
+    // STL
+    //
+    template< typename NumericT >
+    NumericT max(std::vector<NumericT> const & v1)
+    {
+      //std::cout << "stl .. " << std::endl;
+      NumericT result = v1[0];
+      for (vcl_size_t i=1; i<v1.size(); ++i)
+      {
+        if (v1[i] > result)
+          result = v1[i];
+      }
+
+      return result;
+    }
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_max >
+    max(viennacl::vector_base<ScalarType> const & v1)
+    {
+       //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_max >(v1, v1);
+    }
+
+    // with vector expression:
+    template<typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_max>
+    max(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_max >(vector, vector);
+    }
+
+    // ----------------------------------------------------
+    // STL
+    //
+    template< typename NumericT >
+    NumericT min(std::vector<NumericT> const & v1)
+    {
+      //std::cout << "stl .. " << std::endl;
+      NumericT result = v1[0];
+      for (vcl_size_t i=1; i<v1.size(); ++i)
+      {
+        if (v1[i] < result)
+          result = v1[i];
+      }
+
+      return result;
+    }
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_min >
+    min(viennacl::vector_base<ScalarType> const & v1)
+    {
+       //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_min >(v1, v1);
+    }
+
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_min >
+    min(viennacl::vector<ScalarType> const & v1)
+    {
+       //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_min >(v1, v1);
+    }
+
+    // with vector expression:
+    template<typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_min>
+    min(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_min >(vector, vector);
+    }
+
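+    // Usage sketch (illustrative): the STL overloads reduce on the host, while the
+    // viennacl::vector_base overloads return a lazy scalar_expression that is evaluated
+    // on assignment (e.g. to a viennacl::scalar<>, assuming the usual expression-to-scalar
+    // assignment path):
+    //
+    //   std::vector<double> v_host(16, 1.0);
+    //   double m_host = viennacl::linalg::max(v_host);
+    //
+    //   viennacl::vector<double> v(16);
+    //   viennacl::scalar<double> m_dev = viennacl::linalg::min(v);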
+
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
new file mode 100644
index 0000000..208573f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
@@ -0,0 +1,94 @@
+#ifndef VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/misc_operations.hpp
+    @brief Implementations of miscellaneous operations
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/misc_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/misc_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/misc_operations.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace detail
+    {
+
+      template<typename ScalarType>
+      void level_scheduling_substitute(vector<ScalarType> & vec,
+                                  viennacl::backend::mem_handle const & row_index_array,
+                                  viennacl::backend::mem_handle const & row_buffer,
+                                  viennacl::backend::mem_handle const & col_buffer,
+                                  viennacl::backend::mem_handle const & element_buffer,
+                                  vcl_size_t num_rows
+                                  )
+      {
+        assert( viennacl::traits::handle(vec).get_active_handle_id() == row_index_array.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==      row_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==      col_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==  element_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+
+        switch (viennacl::traits::handle(vec).get_active_handle_id())
+        {
+          case viennacl::MAIN_MEMORY:
+            viennacl::linalg::host_based::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case viennacl::OPENCL_MEMORY:
+            viennacl::linalg::opencl::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case viennacl::CUDA_MEMORY:
+            viennacl::linalg::cuda::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#endif
+          case viennacl::MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("not implemented");
+        }
+      }
+
+
+
+
+    } //namespace detail
+
+
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
new file mode 100644
index 0000000..78254b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
@@ -0,0 +1,199 @@
+#ifndef VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+#define VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/mixed_precision_cg.hpp
+    @brief The conjugate gradient method using mixed precision is implemented here. Experimental.
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/vector_proxy.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for the mixed-precision conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+    */
+    class mixed_precision_cg_tag
+    {
+      public:
+        /** @brief The constructor
+        *
+        * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+        * @param max_iterations   The maximum number of iterations
+        * @param inner_tol        Inner tolerance for the low-precision iterations
+        */
+        mixed_precision_cg_tag(double tol = 1e-8, unsigned int max_iterations = 300, float inner_tol = 1e-2f) : tol_(tol), iterations_(max_iterations), inner_tol_(inner_tol) {}
+
+        /** @brief Returns the relative tolerance */
+        double tolerance() const { return tol_; }
+        /** @brief Returns the inner tolerance used for the low-precision iterations */
+        float inner_tolerance() const { return inner_tol_; }
+        /** @brief Returns the maximum number of iterations */
+        unsigned int max_iterations() const { return iterations_; }
+
+        /** @brief Returns the number of solver iterations taken */
+        unsigned int iters() const { return iters_taken_; }
+        void iters(unsigned int i) const { iters_taken_ = i; }
+
+        /** @brief Returns the estimated relative error at the end of the solver run */
+        double error() const { return last_error_; }
+        /** @brief Sets the estimated relative error at the end of the solver run */
+        void error(double e) const { last_error_ = e; }
+
+
+      private:
+        double tol_;
+        unsigned int iterations_;
+        float inner_tol_;
+
+        //return values from solver
+        mutable unsigned int iters_taken_;
+        mutable double last_error_;
+    };
+
+
+    /** @brief Implementation of the mixed-precision conjugate gradient solver without preconditioner
+    *
+    * Follows the algorithm in the book by Y. Saad, "Iterative Methods for Sparse Linear Systems"
+    *
+    * @param matrix     The system matrix
+    * @param rhs        The load vector
+    * @param tag        Solver configuration tag
+    * @return The result vector
+    */
+    template<typename MatrixType, typename VectorType>
+    VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag)
+    {
+      //typedef typename VectorType::value_type      ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<VectorType>::type    CPU_ScalarType;
+
+      //std::cout << "Starting CG" << std::endl;
+      vcl_size_t problem_size = viennacl::traits::size(rhs);
+      VectorType result(rhs);
+      viennacl::traits::clear(result);
+
+      VectorType residual = rhs;
+
+      CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs, rhs);
+      CPU_ScalarType new_ip_rr = 0;
+      CPU_ScalarType norm_rhs_squared = ip_rr;
+
+      if (norm_rhs_squared <= 0) //solution is zero if RHS norm is zero
+        return result;
+
+      viennacl::vector<float> residual_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> result_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> p_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> tmp_low_precision(problem_size, viennacl::traits::context(rhs));
+      float inner_ip_rr = static_cast<float>(ip_rr);
+      float new_inner_ip_rr = 0;
+      float initial_inner_rhs_norm_squared = static_cast<float>(ip_rr);
+      float alpha;
+      float beta;
+
+      // transfer rhs to single precision:
+      p_low_precision = rhs;
+      residual_low_precision = p_low_precision;
+
+      // transfer matrix to single precision:
+      viennacl::compressed_matrix<float> matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs));
+      viennacl::backend::memory_copy(matrix.handle1(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle1()), 0, 0, matrix_low_precision.handle1().raw_size() );
+      viennacl::backend::memory_copy(matrix.handle2(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle2()), 0, 0, matrix_low_precision.handle2().raw_size() );
+
+      viennacl::vector_base<CPU_ScalarType> matrix_elements_high_precision(const_cast<viennacl::backend::mem_handle &>(matrix.handle()), matrix.nnz(), 0, 1);
+      viennacl::vector_base<float>          matrix_elements_low_precision(matrix_low_precision.handle(), matrix.nnz(), 0, 1);
+      matrix_elements_low_precision = matrix_elements_high_precision;
+      matrix_low_precision.generate_row_block_information();
+
+      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+      {
+        tag.iters(i+1);
+
+        // lower precision 'inner iteration'
+        tmp_low_precision = viennacl::linalg::prod(matrix_low_precision, p_low_precision);
+
+        alpha = inner_ip_rr / viennacl::linalg::inner_prod(tmp_low_precision, p_low_precision);
+        result_low_precision += alpha * p_low_precision;
+        residual_low_precision -= alpha * tmp_low_precision;
+
+        new_inner_ip_rr = viennacl::linalg::inner_prod(residual_low_precision, residual_low_precision);
+
+        beta = new_inner_ip_rr / inner_ip_rr;
+        inner_ip_rr = new_inner_ip_rr;
+
+        p_low_precision = residual_low_precision + beta * p_low_precision;
+
+        //
+        // If enough progress has been achieved, update current residual with high precision evaluation
+        // This is effectively a restart of the CG method
+        //
+        if (new_inner_ip_rr < tag.inner_tolerance() * initial_inner_rhs_norm_squared || i == tag.max_iterations()-1)
+        {
+          residual = result_low_precision; // reusing residual vector as temporary buffer for conversion. Overwritten below anyway
+          result += residual;
+
+          // residual = b - Ax  (without introducing a temporary)
+          residual = viennacl::linalg::prod(matrix, result);
+          residual = rhs - residual;
+
+          new_ip_rr = viennacl::linalg::inner_prod(residual, residual);
+          if (new_ip_rr / norm_rhs_squared < tag.tolerance() * tag.tolerance())  // squared norms involved here
+            break;
+
+          p_low_precision = residual;
+
+          result_low_precision.clear();
+          residual_low_precision = p_low_precision;
+          initial_inner_rhs_norm_squared = static_cast<float>(new_ip_rr);
+          inner_ip_rr = static_cast<float>(new_ip_rr);
+        }
+      }
+
+      //store last error estimate:
+      tag.error(std::sqrt(new_ip_rr / norm_rhs_squared));
+
+      return result;
+    }
+
+    template<typename MatrixType, typename VectorType>
+    VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag, viennacl::linalg::no_precond)
+    {
+      return solve(matrix, rhs, tag);
+    }
+
+
+  }
+}
+
+#endif
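
A minimal usage sketch for the mixed-precision CG solver above (illustrative only; the tag constructor argument order and the iters()/error() getters are assumed from the members shown above, not confirmed by this excerpt):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/mixed_precision_cg.hpp"   // header containing mixed_precision_cg_tag (assumed path)

    // Solve A x = b for a symmetric positive definite sparse matrix A.
    viennacl::vector<double> solve_spd(viennacl::compressed_matrix<double> const & A,
                                       viennacl::vector<double> const & b)
    {
      // assumed constructor order: (relative tolerance, max. iterations, inner (float) tolerance)
      viennacl::linalg::mixed_precision_cg_tag tag(1e-8, 300, 1e-2f);
      viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);
      // tag.iters() and tag.error() are expected to report the iteration count and final residual estimate
      return x;
    }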

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
new file mode 100644
index 0000000..c962c8e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
@@ -0,0 +1,91 @@
+#ifndef VIENNACL_LINALG_NMF_HPP
+#define VIENNACL_LINALG_NMF_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/nmf.hpp
+ @brief Provides a nonnegative matrix factorization implementation.  Experimental.
+
+
+ */
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+#include "viennacl/linalg/opencl/nmf_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/nmf_operations.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+     *
+     * @param V     Input matrix
+     * @param W     First factor
+     * @param H     Second factor
+     * @param conf  A configuration object holding tolerances and the like
+     */
+    template<typename ScalarType>
+    void nmf(viennacl::matrix_base<ScalarType> const & V, viennacl::matrix_base<ScalarType> & W,
+        viennacl::matrix_base<ScalarType> & H, viennacl::linalg::nmf_config const & conf)
+    {
+      assert(V.size1() == W.size1() && V.size2() == H.size2() && bool("Dimensions of W and H don't allow for V = W * H"));
+      assert(W.size2() == H.size1() && bool("Dimensions of W and H don't match, prod(W, H) impossible"));
+
+      switch (viennacl::traits::handle(V).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::nmf(V, W, H, conf);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+          case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::nmf(V,W,H,conf);
+          break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+          case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::nmf(V,W,H,conf);
+          break;
+#endif
+
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+
+      }
+
+    }
+  }
+}
+
+#endif
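
A minimal usage sketch for nmf() (illustrative; assumes the default nmf_config constructor from viennacl/linalg/host_based/nmf_operations.hpp and that W and H are pre-sized with nonnegative starting values):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/nmf.hpp"

    // Factorize V (m x n) into W (m x k) and H (k x n); the caller allocates W and H
    // and fills them with nonnegative (e.g. random) initial entries.
    void factorize(viennacl::matrix<float> const & V,
                   viennacl::matrix<float>       & W,
                   viennacl::matrix<float>       & H)
    {
      viennacl::linalg::nmf_config conf;     // assumed to carry default tolerances and iteration limits
      viennacl::linalg::nmf(V, W, H, conf);  // on return, W * H approximates V
    }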

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
new file mode 100644
index 0000000..e16238b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
@@ -0,0 +1,104 @@
+#ifndef VIENNACL_LINALG_NORM_1_HPP_
+#define VIENNACL_LINALG_NORM_1_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_1.hpp
+    @brief Generic interface for the l^1-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic norm_1 function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type
+                                >::type
+    norm_1(VectorT const& vector)
+    {
+      // std::cout << "ublas .. " << std::endl;
+      return boost::numeric::ublas::norm_1(vector);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // STL
+    //
+    template< typename T, typename A >
+    T norm_1(std::vector<T, A> const & v1)
+    {
+      //std::cout << "stl .. " << std::endl;
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+        result += std::fabs(v1[i]);
+
+      return result;
+    }
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_norm_1 >
+    norm_1(viennacl::vector_base<ScalarType> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_norm_1 >(vector, vector);
+    }
+
+    // with vector expression:
+    template<typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_1>
+    norm_1(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_1 >(vector, vector);
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
new file mode 100644
index 0000000..babb285
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
@@ -0,0 +1,140 @@
+#ifndef VIENNACL_LINALG_NORM_2_HPP_
+#define VIENNACL_LINALG_NORM_2_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_2.hpp
+    @brief Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic norm_2 function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+    #ifdef VIENNACL_WITH_MTL4
+    // ----------------------------------------------------
+    // MTL4
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type>::type
+    norm_2(VectorT const & v)
+    {
+      return mtl::two_norm(v);
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_ARMADILLO
+    // ----------------------------------------------------
+    // Armadillo
+    //
+    template<typename NumericT>
+    NumericT norm_2(arma::Col<NumericT> const& v)
+    {
+      return norm(v);
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_EIGEN
+    // ----------------------------------------------------
+    // EIGEN
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::RealScalar>::type
+    norm_2(VectorT const & v)
+    {
+      return v.norm();
+    }
+    #endif
+
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type>::type
+    norm_2(VectorT const & v)
+    {
+      return boost::numeric::ublas::norm_2(v);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // STL
+    //
+    template< typename T, typename A >
+    T norm_2(std::vector<T, A> const & v1)
+    {
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+        result += v1[i] * v1[i];
+
+      return std::sqrt(result);
+    }
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
+                                 viennacl::op_norm_2 >
+    norm_2(viennacl::vector_base<ScalarType> const & v)
+    {
+       //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
+                                          viennacl::op_norm_2 >(v, v);
+    }
+
+    // with vector expression:
+    template<typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_2>
+    norm_2(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_2>(vector, vector);
+    }
+
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
new file mode 100644
index 0000000..6873a53
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
@@ -0,0 +1,73 @@
+#ifndef VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+#define VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/norm_frobenius.hpp
+    @brief Generic interface for the Frobenius norm.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic norm_frobenius function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type
+                                >::type
+    norm_frobenius(VectorT const& v1)
+    {
+      return boost::numeric::ublas::norm_frobenius(v1);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template<typename NumericT>
+    scalar_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_norm_frobenius>
+    norm_frobenius(const matrix_base<NumericT> & A)
+    {
+      return scalar_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_norm_frobenius>(A, A);
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
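
The three norm headers above (norm_1, norm_2, norm_frobenius) all return lazy scalar_expression objects; a brief sketch of how they are typically consumed (illustrative; relies on the usual ViennaCL behaviour that such expressions are evaluated on assignment to a scalar):

    #include <iostream>
    #include "viennacl/vector.hpp"
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/norm_1.hpp"
    #include "viennacl/linalg/norm_2.hpp"
    #include "viennacl/linalg/norm_frobenius.hpp"

    void print_norms(viennacl::vector<double> const & v, viennacl::matrix<double> const & A)
    {
      double l1   = viennacl::linalg::norm_1(v);          // evaluation happens on assignment
      double l2   = viennacl::linalg::norm_2(v);
      double frob = viennacl::linalg::norm_frobenius(A);
      std::cout << l1 << " " << l2 << " " << frob << std::endl;
    }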


[22/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
new file mode 100644
index 0000000..1d212c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
@@ -0,0 +1,307 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/direct_solve.hpp
+    @brief Implementations of dense direct triangular solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+  //
+  // Upper solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      vcl_size_t current_row = A_size - i - 1;
+
+      for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+      {
+        value_type A_element = A(current_row, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(current_row, current_row);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+  //
+  // Lower solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      for (vcl_size_t j = 0; j < i; ++j)
+      {
+        value_type A_element = A(i, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(i, i);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+}
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
+*
+* @param A        The system matrix
+* @param B        The matrix of right-hand sides; the solution is written into it in place
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+  value_type       * data_B = detail::extract_raw_pointer<value_type>(B);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(A);
+  vcl_size_t A_start2 = viennacl::traits::start2(A);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+  //vcl_size_t A_size1  = viennacl::traits::size1(A);
+  vcl_size_t A_size2  = viennacl::traits::size2(A);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(B);
+  vcl_size_t B_start2 = viennacl::traits::start2(B);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+  //vcl_size_t B_size1  = viennacl::traits::size1(B);
+  vcl_size_t B_size2  = viennacl::traits::size2(B);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+
+  if (A.row_major() && B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else if (A.row_major() && !B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major,    false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else if (!A.row_major() && B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       row_major,    false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+}
+
+
+//
+//  Solve on vector
+//
+
+namespace detail
+{
+  //
+  // Upper solve:
+  //
+  template<typename MatrixT, typename VectorT>
+  void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+  {
+    typedef typename VectorT::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      vcl_size_t current_row = A_size - i - 1;
+
+      for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+      {
+        value_type A_element = A(current_row, j);
+        b(current_row) -= A_element * b(j);
+      }
+
+      if (!unit_diagonal)
+        b(current_row) /= A(current_row, current_row);
+    }
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
+  {
+    upper_inplace_solve_vector(A, b, A_size, true);
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
+  {
+    upper_inplace_solve_vector(A, b, A_size, false);
+  }
+
+  //
+  // Lower solve:
+  //
+  template<typename MatrixT, typename VectorT>
+  void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+  {
+    typedef typename VectorT::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      for (vcl_size_t j = 0; j < i; ++j)
+      {
+        value_type A_element = A(i, j);
+        b(i) -= A_element * b(j);
+      }
+
+      if (!unit_diagonal)
+        b(i) /= A(i, i);
+    }
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
+  {
+    lower_inplace_solve_vector(A, b, A_size, true);
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
+  {
+    lower_inplace_solve_vector(A, b, A_size, false);
+  }
+
+}
+
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & mat,
+                   vector_base<NumericT> & vec,
+                   SolverTagT)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_v = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t start1 = viennacl::traits::start(vec);
+  vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+    detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+    detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+  }
+}
+
+
+} // namespace host_based
+} // namespace linalg
+} // namespace viennacl
+
+#endif
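
A sketch of how these host-based triangular solvers are reached through the public API (illustrative; assumes the usual frontend in viennacl/linalg/direct_solve.hpp that dispatches to the host_based backend for main-memory objects):

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    void triangular_solves(viennacl::matrix<double> const & A,   // upper triangular system matrix
                           viennacl::matrix<double>       & B,   // multiple right-hand sides, overwritten with the solution
                           viennacl::vector<double>       & b)   // single right-hand side, overwritten with the solution
    {
      viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());  // B <- A^{-1} B
      viennacl::linalg::inplace_solve(A, b, viennacl::linalg::upper_tag());  // b <- A^{-1} b
    }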

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
new file mode 100644
index 0000000..f53f8f2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
@@ -0,0 +1,856 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/host_based/fft_operations.hpp
+    @brief Implementations of the Fast Fourier Transform using plain single-threaded or OpenMP-enabled execution on the CPU
+ */
+
+// TODO: OpenMP conditions
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include <stdexcept>
+#include <cmath>
+#include <complex>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+  namespace fft
+  {
+    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+    namespace FFT_DATA_ORDER
+    {
+      enum DATA_ORDER
+      {
+        ROW_MAJOR, COL_MAJOR
+      };
+    }
+
+    inline vcl_size_t num_bits(vcl_size_t size)
+    {
+      vcl_size_t bits_datasize = 0;
+      vcl_size_t ds = 1;
+
+      while (ds < size)
+      {
+        ds = ds << 1;
+        bits_datasize++;
+      }
+
+      return bits_datasize;
+    }
+
+    inline vcl_size_t next_power_2(vcl_size_t n)
+    {
+      n = n - 1;
+
+      vcl_size_t power = 1;
+
+      while (power < sizeof(vcl_size_t) * 8)
+      {
+        n = n | (n >> power);
+        power *= 2;
+      }
+
+      return n + 1;
+    }
+
+    inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
+    {
+      v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+      v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+      v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+      v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+      v = (v >> 16) | (v << 16);
+      v = v >> (32 - bit_size);
+      return v;
+    }
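+    // Illustrative values: num_bits(8) == 3, next_power_2(5) == 8, and
+    // get_reorder_num(3, 3) == 6 (binary 011 reversed within 3 bits gives 110).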
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector_base<NumericT> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in(i * 2)     = static_cast<NumericT>(std::real(input_complex[i]));
+        in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               NumericT const * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector_base<NumericT> & in, vcl_size_t size)
+    {
+      std::vector<NumericT> temp(2 * size);
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        temp[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+      viennacl::copy(temp, in);
+    }
+
+    template<typename NumericT>
+    void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        input1[i] = 0;
+        input2[i] = 0;
+      }
+    }
+
+  } //namespace fft
+
+} //namespace detail
+
+/**
+ * @brief Direct DFT algorithm kernel
+ */
+template<typename NumericT>
+void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
+                vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
+                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t k = 0; k < size; k += 1)
+    {
+      std::complex<NumericT> f = 0;
+      for (vcl_size_t n = 0; n < size; n++)
+      {
+        std::complex<NumericT> input;
+        if (!data_order)
+          input = input_complex[batch_id * stride + n]; //input index here
+        else
+          input = input_complex[n * stride + batch_id];
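+        // twiddle factor exp(sign * 2*pi*i * k*n / size), evaluated via std::cos/std::sin below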
+        NumericT arg = sign * 2 * NUM_PI * NumericT(k) / NumericT(size) * NumericT(n);
+        NumericT sn  = std::sin(arg);
+        NumericT cs  = std::cos(arg);
+
+        std::complex<NumericT> ex(cs, sn);
+        std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
+                                   input.real() * ex.imag() + input.imag() * ex.real());
+        f = f + tmp;
+      }
+      if (!data_order)
+        output[batch_id * stride + k] = f;   // output index here
+      else
+        output[k * stride + batch_id] = f;
+    }
+  }
+
+}
+
+/**
+ * @brief Direct 1D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+  std::vector<std::complex<NumericT> > output(size * batch_num);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+  fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], out, size * batch_num);
+}
+
+/**
+ * @brief Direct 2D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input_complex(size_mat);
+  std::vector<std::complex<NumericT> > output(size_mat);
+
+  NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_B = detail::extract_raw_pointer<NumericT>(out);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size_mat);
+
+  fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], data_B, size_mat);
+}
+
+/*
+ * This function reorders the 1D input data: indices are sorted into bit-reversal order.
+ * Such reordering must be done before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+             vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  std::vector<std::complex<NumericT> > input(size * batch_num);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], in, size * batch_num);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t i = 0; i < size; i++)
+    {
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+      if (i < v)
+      {
+        if (!data_order)
+        {
+          std::complex<NumericT> tmp   = input[batch_id * stride + i]; // index
+          input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+          input[batch_id * stride + v] = tmp;      //index
+        }
+        else
+        {
+          std::complex<NumericT> tmp   = input[i * stride + batch_id]; // index
+          input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+          input[v * stride + batch_id] = tmp;      //index
+        }
+      }
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], in, size * batch_num);
+}
+
+/*
+ * This function reorders the 2D input data: indices are sorted into bit-reversal order.
+ * Such reordering must be done before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input(size_mat);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], data, size_mat);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t i = 0; i < size; i++)
+    {
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+      if (i < v)
+      {
+        if (!data_order)
+        {
+          std::complex<NumericT> tmp   = input[batch_id * stride + i]; // index
+          input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+          input[batch_id * stride + v] = tmp;      //index
+        } else
+        {
+          std::complex<NumericT> tmp   = input[i * stride + batch_id]; // index
+          input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+          input[v * stride + batch_id] = tmp;      //index
+        }
+      }
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], data, size_mat);
+}
+
+/**
+ * @brief Radix-2 algorithm for computing the Fourier transform.
+ * Kernel for larger data sizes (used by radix2() when the size exceeds MAX_LOCAL_POINTS_NUM)
+ */
+template<typename NumericT>
+void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num,
+                vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign,
+                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+  for (vcl_size_t step = 0; step < bit_size; step++)
+  {
+    vcl_size_t ss = 1 << step;
+    vcl_size_t half_size = size >> 1;
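+    // stage 'step': each butterfly combines two entries whose in-batch indices differ by ss = 2^step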
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+    {
+      vcl_size_t batch_id = vcl_size_t(batch_id2);
+      for (vcl_size_t tid = 0; tid < half_size; tid++)
+      {
+        vcl_size_t group = (tid & (ss - 1));
+        vcl_size_t pos = ((tid >> step) << (step + 1)) + group;
+        std::complex<NumericT> in1;
+        std::complex<NumericT> in2;
+        vcl_size_t offset;
+        if (!data_order)
+        {
+          offset = batch_id * stride + pos;
+          in1 = input_complex[offset];
+          in2 = input_complex[offset + ss];
+        }
+        else
+        {
+          offset = pos * stride + batch_id;
+          in1 = input_complex[offset];
+          in2 = input_complex[offset + ss * stride];
+        }
+        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+        NumericT sn = std::sin(arg);
+        NumericT cs = std::cos(arg);
+        std::complex<NumericT> ex(cs, sn);
+        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+                                   in2.real() * ex.imag() + in2.imag() * ex.real());
+        if (!data_order)
+          input_complex[offset + ss] = in1 - tmp;
+        else
+          input_complex[offset + ss * stride] = in1 - tmp;
+        input_complex[offset] = in1 + tmp;
+      }
+    }
+  }
+
+}
+
+/**
+ * @brief Radix-2 algorithm for computing the Fourier transform.
+ * Kernel for smaller data sizes (used by radix2() when the data fits into a local buffer of MAX_LOCAL_POINTS_NUM points)
+ */
+template<typename NumericT>
+void fft_radix2_local(std::complex<NumericT> * input_complex,
+                      std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size,
+                      vcl_size_t size, vcl_size_t stride, NumericT sign,
+                      viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+  for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long p2 = 0; p2 < long(size); p2 += 1)
+    {
+      vcl_size_t p = vcl_size_t(p2);
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size);
+
+      if (!data_order)
+        lcl_input[v] = input_complex[batch_id * stride + p]; //index
+      else
+        lcl_input[v] = input_complex[p * stride + batch_id];
+    }
+
+    for (vcl_size_t s = 0; s < bit_size; s++)
+    {
+      vcl_size_t ss = 1 << s;
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for
+#endif
+      for (long tid2 = 0; tid2 < long(size)/2; tid2++)
+      {
+        vcl_size_t tid = vcl_size_t(tid2);
+        vcl_size_t group = (tid & (ss - 1));
+        vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
+
+        std::complex<NumericT> in1 = lcl_input[pos];
+        std::complex<NumericT> in2 = lcl_input[pos + ss];
+
+        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+
+        NumericT sn = std::sin(arg);
+        NumericT cs = std::cos(arg);
+        std::complex<NumericT> ex(cs, sn);
+
+        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+                                   in2.real() * ex.imag() + in2.imag() * ex.real());
+
+        lcl_input[pos + ss] = in1 - tmp;
+        lcl_input[pos] = in1 + tmp;
+      }
+
+    }
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    //copy local array back to global memory
+    for (long p2 = 0; p2 < long(size); p2 += 1)
+    {
+      vcl_size_t p = vcl_size_t(p2);
+      if (!data_order)
+        input_complex[batch_id * stride + p] = lcl_input[p];
+      else
+        input_complex[p * stride + batch_id] = lcl_input[p];
+
+    }
+
+  }
+
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+  std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+  else
+  {
+    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num);
+}
+
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input_complex(size_mat);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    //std::cout<<bit_size<<","<<size<<","<<stride<<","<<batch_num<<","<<size<<","<<sign<<","<<data_order<<std::endl;
+    std::vector<std::complex<NumericT> > lcl_input(size_mat);
+    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+  else
+  {
+    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat);
+
+}
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes less than 2^16.
+ * The transform is expressed as a convolution with a chirp sequence, which requires zero-padding
+ * to the next power of two and hence considerable additional memory, but it handles arbitrary
+ * (also non-power-of-two) sizes. The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+
+  vcl_size_t size = in.size() >> 1;
+  vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1);
+
+  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+  std::vector<std::complex<NumericT> > output_complex(size);
+
+  std::vector<std::complex<NumericT> > A_complex(ext_size);
+  std::vector<std::complex<NumericT> > B_complex(ext_size);
+  std::vector<std::complex<NumericT> > Z_complex(ext_size);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(ext_size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    A_complex[i] = 0;
+    B_complex[i] = 0;
+  }
+
+  vcl_size_t double_size = size << 1;
+
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t rm = i * i % (double_size);
+    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI);
+
+    NumericT sn_a = std::sin(-angle);
+    NumericT cs_a = std::cos(-angle);
+
+    std::complex<NumericT> a_i(cs_a, sn_a);
+    std::complex<NumericT> b_i(cs_a, -sn_a);
+
+    A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
+                                          input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
+    B_complex[i] = b_i;
+
+    // very bad instruction, to be fixed
+    if (i)
+      B_complex[ext_size - i] = b_i;
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size);
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size);
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size);
+
+  viennacl::linalg::convolve_i(A, B, Z);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t rm = i * i % (double_size);
+    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI);
+    NumericT sn_a = std::sin(angle);
+    NumericT cs_a = std::cos(angle);
+    std::complex<NumericT> b_i(cs_a, sn_a);
+    output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
+                                               Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size);
+
+}
+
+/**
+ * @brief Normalize a vector by its own size (divide each entry by the number of complex elements)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  vcl_size_t size = input.size() >> 1;
+  NumericT norm_factor = static_cast<NumericT>(size);
+  for (vcl_size_t i = 0; i < size * 2; i++)
+    input[i] /= norm_factor;
+
+}
+
+/**
+ * @brief Elementwise complex multiplication of two vectors (interleaved real/imaginary layout)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV> & output)
+{
+  vcl_size_t size = input1.size() >> 1;
+
+  std::vector<std::complex<NumericT> > input1_complex(size);
+  std::vector<std::complex<NumericT> > input2_complex(size);
+  std::vector<std::complex<NumericT> > output_complex(size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input1_complex[0], input1, size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input2_complex[0], input2, size);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    std::complex<NumericT> in1 = input1_complex[i];
+    std::complex<NumericT> in2 = input2_complex[i];
+    output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(),
+                                               in1.real() * in2.imag() + in1.imag() * in2.real());
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], output, size);
+
+}
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  vcl_size_t row_num = input.internal_size1() / 2;
+  vcl_size_t col_num = input.internal_size2() / 2;
+
+  vcl_size_t size = row_num * col_num;
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(input);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t row = i / col_num;
+    vcl_size_t col = i - row * col_num;
+    vcl_size_t new_pos = col * row_num + row;
+
+    if (i < new_pos)
+    {
+      std::complex<NumericT> val = input_complex[i];
+      input_complex[i] = input_complex[new_pos];
+      input_complex[new_pos] = val;
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size);
+
+}
+
+/**
+ * @brief Out-of-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & output)
+{
+
+  vcl_size_t row_num = input.internal_size1() / 2;
+  vcl_size_t col_num = input.internal_size2() / 2;
+  vcl_size_t size = row_num * col_num;
+
+  NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input);
+  NumericT       * data_B = detail::extract_raw_pointer<NumericT>(output);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size);
+
+  std::vector<std::complex<NumericT> > output_complex(size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t row = i / col_num;
+    vcl_size_t col = i % col_num;
+    vcl_size_t new_pos = col * row_num + row;
+    output_complex[new_pos] = input_complex[i];
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], data_B, size);
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    data_out[2*i  ] = data_in[i];
+    data_out[2*i+1] = NumericT(0);
+  }
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < long(size); i++)
+    data_out[i] = data_in[2*i];
+}
+
+/**
+ * @brief Reverses the vector in place (first element swapped with last, and so on)
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+  vcl_size_t size = in.size();
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i2 = 0; i2 < long(size / 2); i2++)  // swap each element with its mirror only once
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    NumericT val1 = in[i];
+    NumericT val2 = in[size - i - 1];
+    in[i] = val2;
+    in[size - i - 1] = val1;
+  }
+}
+
+}      //namespace host_based
+}      //namespace linalg
+}      //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
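
A brief usage sketch of the complex-vector helpers defined in this file may help orientation. The vectors store complex numbers interleaved (element 2k = real part, element 2k+1 = imaginary part); the include path below is inferred from the header guard, the main() wrapper is hypothetical, and the data is assumed to reside in a host (CPU) memory context, so treat this as an illustrative sketch rather than canonical usage:

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/host_based/fft_operations.hpp"  // assumed path of the file above

    int main()
    {
      // four complex numbers per vector, interleaved as (re, im) pairs
      std::vector<float> host_a(8, 1.0f), host_b(8, 0.5f), host_c(8);
      viennacl::vector<float> a(8), b(8), c(8);
      viennacl::copy(host_a, a);
      viennacl::copy(host_b, b);

      // element-wise complex product, then division by the number of complex elements (4 here)
      viennacl::linalg::host_based::multiply_complex(a, b, c);
      viennacl::linalg::host_based::normalize(c);

      viennacl::copy(c, host_c);
      return 0;
    }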

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
new file mode 100644
index 0000000..62c885a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
@@ -0,0 +1,672 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP)
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE
+  #define VIENNACL_OPENMP_ILU_MIN_SIZE  5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  // L is known to have correct dimensions
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+
+  //
+  // Step 1: Count elements in L
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (long(col) <= row)
+        ++L_row_buffer[row];
+    }
+  }
+
+  //
+  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  //
+  // Step 3: Write entries:
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    unsigned int index_L = L_row_buffer[row];
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      NumericT value = A_elements[j];
+
+      if (long(col) <= row)
+      {
+        L_col_buffer[index_L] = col;
+        L_elements[index_L]   = value;
+        ++index_L;
+      }
+    }
+  }
+
+} // extract_L
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values from A stored in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
+
+  //
+  // Step 1: Determine D
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+
+  //
+  // Step 2: Scale values in L:
+  //
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = L_row_buffer[row];
+    unsigned int col_end   = L_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+  }
+
+  L.generate_row_block_information();
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+                          vector<NumericT>            & aij_L)
+{
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  NumericT           *aij_ptr   = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+  // temporary workspace
+  NumericT *L_backup = new NumericT[L.nnz()];
+
+  // backup:
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+    L_backup[i] = L_elements[i];
+
+
+  // sweep
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_Li_start = L_row_buffer[row];
+    unsigned int row_Li_end   = L_row_buffer[row + 1];
+
+    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+    {
+      unsigned int col = L_col_buffer[i];
+
+      unsigned int row_Lj_start = L_row_buffer[col];
+      unsigned int row_Lj_end   = L_row_buffer[col+1];
+
+      // compute \sum_{k=1}^{j-1} l_ik l_jk
+      unsigned int index_Lj = row_Lj_start;
+      unsigned int col_Lj = L_col_buffer[index_Lj];
+      NumericT s = aij_ptr[i];
+      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+      {
+        unsigned int col_Li = L_col_buffer[index_Li];
+
+        // find element in row j
+        while (col_Lj < col_Li)
+        {
+          ++index_Lj;
+          col_Lj = L_col_buffer[index_Lj];
+        }
+
+        if (col_Lj == col_Li)
+          s -= L_backup[index_Li] * L_backup[index_Lj];
+      }
+
+      if (row != col)
+        L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row!
+      else
+        L_elements[i] = std::sqrt(s);
+    }
+  }
+
+  delete[] L_backup;
+}
+
+
+
+//////////////////////// ILU ////////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  // L and U are known to have correct dimensions
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int       *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+
+  //
+  // Step 1: Count elements in L and U
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (long(col) <= row)
+        ++L_row_buffer[row];
+      if (long(col) >= row)
+        ++U_row_buffer[row];
+    }
+  }
+
+  //
+  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer);
+  U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  unsigned int       *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+
+  //
+  // Step 3: Write entries:
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    unsigned int index_L = L_row_buffer[row];
+    unsigned int index_U = U_row_buffer[row];
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      NumericT value = A_elements[j];
+
+      if (long(col) <= row)
+      {
+        L_col_buffer[index_L] = col;
+        L_elements[index_L]   = value;
+        ++index_L;
+      }
+
+      if (long(col) >= row)
+      {
+        U_col_buffer[index_U] = col;
+        U_elements[index_U]   = value;
+        ++index_U;
+      }
+    }
+  }
+
+} // extract_LU
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
+
+  //
+  // Step 1: Determine D
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+
+  //
+  // Step 2: Scale values in L:
+  //
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = L_row_buffer[row];
+    unsigned int col_end   = L_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+  }
+
+  //
+  // Step 3: Scale values in U:
+  //
+  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = U_row_buffer[row];
+    unsigned int col_end   = U_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      U_elements[j] *= D_row * D_elements[U_col_buffer[j]];
+  }
+
+  L.generate_row_block_information();
+  // Note: block information for U will be generated after transposition
+
+}
+
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+                   compressed_matrix<NumericT>       & B)
+{
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  // initialize datastructures for B:
+  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+  NumericT     * B_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
+  unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
+  unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+  // prepare uninitialized B_row_buffer:
+  for (std::size_t i = 0; i < B.size1(); ++i)
+    B_row_buffer[i] = 0;
+
+  //
+  // Stage 1: Compute pattern for B
+  //
+  for (std::size_t row = 0; row < A.size1(); ++row)
+  {
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+      B_row_buffer[A_col_buffer[nnz_index]] += 1;
+  }
+
+  // Bring row-start array in place using exclusive-scan:
+  unsigned int offset = B_row_buffer[0];
+  B_row_buffer[0] = 0;
+  for (std::size_t row = 1; row < B.size1(); ++row)
+  {
+    unsigned int tmp = B_row_buffer[row];
+    B_row_buffer[row] = offset;
+    offset += tmp;
+  }
+  B_row_buffer[B.size1()] = offset;
+
+  //
+  // Stage 2: Fill with data
+  //
+
+  std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
+
+  for (unsigned int row = 0; row < static_cast<unsigned int>(A.size1()); ++row)
+  {
+    //std::cout << "Row " << row << ": ";
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col_in_A = A_col_buffer[nnz_index];
+      unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];
+      B_col_buffer[B_nnz_index] = row;
+      B_elements[B_nnz_index] = A_elements[nnz_index];
+      ++B_row_offsets[col_in_A];
+      //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
+    }
+  }
+
+  // Step 3: Make datastructure consistent (row blocks!)
+  B.generate_row_block_information();
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  NumericT     const *aij_L_ptr    = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1());
+  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U_trans.handle());
+
+  NumericT     const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle());
+
+  // temporary workspace
+  NumericT *L_backup = new NumericT[L.nnz()];
+  NumericT *U_backup = new NumericT[U_trans.nnz()];
+
+  // backup:
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+    L_backup[i] = L_elements[i];
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i)
+    U_backup[i] = U_elements[i];
+
+  // sweep
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_L_start = L_row_buffer[row];
+    unsigned int row_L_end   = L_row_buffer[row + 1];
+
+    for (unsigned int j = row_L_start; j < row_L_end; ++j)
+    {
+      unsigned int col = L_col_buffer[j];
+
+      if (col == row)
+        continue;
+
+      unsigned int row_U_start = U_row_buffer[col];
+      unsigned int row_U_end   = U_row_buffer[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_U = row_U_start;
+      unsigned int col_U = (index_U < row_U_end) ? U_col_buffer[index_U] : static_cast<unsigned int>(U_trans.size2());
+      NumericT sum = 0;
+      for (unsigned int k = row_L_start; k < j; ++k)
+      {
+        unsigned int col_L = L_col_buffer[k];
+
+        // find element in U
+        while (col_U < col_L)
+        {
+          ++index_U;
+          col_U = U_col_buffer[index_U];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[k] * U_backup[index_U];
+      }
+
+      // update l_ij:
+      assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!"));
+      L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1];  // diagonal element is last entry in U
+    }
+
+
+    //
+    // update U:
+    //
+    unsigned int row_U_start = U_row_buffer[row];
+    unsigned int row_U_end   = U_row_buffer[row + 1];
+    for (unsigned int j = row_U_start; j < row_U_end; ++j)
+    {
+      unsigned int col = U_col_buffer[j];
+
+      row_L_start = L_row_buffer[col];
+      row_L_end   = L_row_buffer[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_L = row_L_start;
+      unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast<unsigned int>(L.size1());
+      NumericT sum = 0;
+      for (unsigned int k = row_U_start; k < j; ++k)
+      {
+        unsigned int col_U = U_col_buffer[k];
+
+        // find element in L
+        while (col_L < col_U)
+        {
+          ++index_L;
+          col_L = L_col_buffer[index_L];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[index_L] * U_backup[k];
+      }
+
+      // update u_ij:
+      U_elements[j] = aij_U_trans_ptr[j] - sum;
+    }
+  }
+
+  delete[] L_backup;
+  delete[] U_backup;
+}
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  unsigned int *R_row_buffer = detail::extract_raw_pointer<unsigned int>(R.handle1());
+  unsigned int *R_col_buffer = detail::extract_raw_pointer<unsigned int>(R.handle2());
+  NumericT     *R_elements   = detail::extract_raw_pointer<NumericT>(R.handle());
+
+  NumericT     *diag_R_ptr   = detail::extract_raw_pointer<NumericT>(diag_R.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(R.size1()); ++row)
+  {
+    unsigned int col_begin = R_row_buffer[row];
+    unsigned int col_end   = R_row_buffer[row+1];
+
+    // part 1: extract diagonal entry
+    NumericT diag = 0;
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = R_col_buffer[j];
+      if (col == row)
+      {
+        diag = R_elements[j];
+        R_elements[j] = 0; // (I - D^{-1}R)
+        break;
+      }
+    }
+    diag_R_ptr[row] = diag;
+
+    assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!"));
+
+    // part2: scale
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      R_elements[j] /= -diag;
+  }
+
+  //std::cout << "diag_R: " << diag_R << std::endl;
+}
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
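
For orientation, the building blocks above are meant to compose into the Chow-Patel ILU setup roughly as sketched below. The driver function name is hypothetical, and the sizing of the L/U row buffers as well as the preparation of aij_L and aij_U_trans (the target values of the scaled system matrix) are assumed to be done by the caller, as the comments in extract_LU presuppose:

    template<typename NumericT>
    void chow_patel_ilu_setup_sketch(viennacl::compressed_matrix<NumericT> const & A,
                                     viennacl::compressed_matrix<NumericT>       & L,
                                     viennacl::compressed_matrix<NumericT>       & U,
                                     viennacl::vector<NumericT>            const & aij_L,
                                     viennacl::vector<NumericT>            const & aij_U_trans,
                                     unsigned int                                  num_sweeps)
    {
      viennacl::compressed_matrix<NumericT> U_trans;

      viennacl::linalg::host_based::extract_LU(A, L, U);        // split A into lower and upper triangular parts
      viennacl::linalg::host_based::ilu_scale(A, L, U);         // rescale so that D*A*D has unit diagonal
      viennacl::linalg::host_based::ilu_transpose(U, U_trans);  // CSR transpose of U; the diagonal ends up last in each row

      for (unsigned int i = 0; i < num_sweeps; ++i)             // a few nonlinear fixed-point sweeps
        viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
    }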


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
new file mode 100644
index 0000000..8361308
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
@@ -0,0 +1,208 @@
+#ifndef VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+#define VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file amg_base.hpp
+    @brief Helper classes and functions for the AMG preconditioner. Experimental.
+
+    AMG code contributed by Markus Wagner
+*/
+
+#include <cmath>
+#include <set>
+#include <list>
+#include <stdexcept>
+#include <algorithm>
+
+#include <map>
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+#include "viennacl/context.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Enumeration of coarsening methods for algebraic multigrid. */
+enum amg_coarsening_method
+{
+  AMG_COARSENING_METHOD_ONEPASS = 1,
+  AMG_COARSENING_METHOD_AGGREGATION,
+  AMG_COARSENING_METHOD_MIS2_AGGREGATION
+};
+
+/** @brief Enumeration of interpolation methods for algebraic multigrid. */
+enum amg_interpolation_method
+{
+  AMG_INTERPOLATION_METHOD_DIRECT = 1,
+  AMG_INTERPOLATION_METHOD_AGGREGATION,
+  AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION
+};
+
+
+/** @brief A tag for algebraic multigrid (AMG). Used to transport information from the user to the implementation.
+*/
+class amg_tag
+{
+public:
+  /** @brief The constructor, setting default values for the various parameters.
+    *
+    * Default coarsening routine: Aggregation based on maximum independent sets of distance 2 (MIS-2)
+    * Default interpolation routine: Aggregation (cf. the constructor below)
+    * Default threshold for strong connections: 0.1 (customizations are recommended!)
+    * Default weight for Jacobi smoother: 1.0
+    * Default number of pre-smooth operations: 2
+    * Default number of post-smooth operations: 2
+    * Default number of coarse levels: 0 (this indicates that as many coarse levels as needed are constructed until the cutoff is reached)
+    * Default coarse grid size for direct solver (coarsening cutoff): 50
+    */
+  amg_tag()
+  : coarsening_method_(AMG_COARSENING_METHOD_MIS2_AGGREGATION), interpolation_method_(AMG_INTERPOLATION_METHOD_AGGREGATION),
+    strong_connection_threshold_(0.1), jacobi_weight_(1.0),
+    presmooth_steps_(2), postsmooth_steps_(2),
+    coarse_levels_(0), coarse_cutoff_(50) {}
+
+  // Getter-/Setter-Functions
+  /** @brief Sets the strategy used for constructing coarse grids  */
+  void set_coarsening_method(amg_coarsening_method s) { coarsening_method_ = s; }
+  /** @brief Returns the current coarsening strategy */
+  amg_coarsening_method get_coarsening_method() const { return coarsening_method_; }
+
+  /** @brief Sets the interpolation method to the provided method */
+  void set_interpolation_method(amg_interpolation_method interpol) { interpolation_method_ = interpol; }
+  /** @brief Returns the current interpolation method */
+  amg_interpolation_method get_interpolation_method() const { return interpolation_method_; }
+
+  /** @brief Sets the strong connection threshold. Customizations by the user essential for best results!
+    *
+    * With classical interpolation, a connection is considered strong if |a_ij| >= threshold * max_k(|a_ik|)
+    * Strength of connection currently ignored for aggregation-based coarsening (to be added in the future).
+    */
+  void set_strong_connection_threshold(double threshold) { if (threshold > 0) strong_connection_threshold_ = threshold; }
+  /** @brief Returns the strong connection threshold parameter.
+    *
+    * @see set_strong_connection_threshold() for an explanation of the threshold parameter
+    */
+  double get_strong_connection_threshold() const { return strong_connection_threshold_; }
+
+  /** @brief Sets the weight (damping) for the Jacobi smoother.
+    *
+    * The optimal value depends on the problem at hand. Values of 0.67 or 1.0 are usually a good starting point for further experiments.
+    */
+  void set_jacobi_weight(double w) { if (w > 0) jacobi_weight_ = w; }
+  /** @brief Returns the Jacobi smoother weight (damping). */
+  double get_jacobi_weight() const { return jacobi_weight_; }
+
+  /** @brief Sets the number of smoother applications on the fine level before restriction to the coarser level. */
+  void set_presmooth_steps(vcl_size_t steps) { presmooth_steps_ = steps; }
+  /** @brief Returns the number of smoother applications on the fine level before restriction to the coarser level. */
+  vcl_size_t get_presmooth_steps() const { return presmooth_steps_; }
+
+  /** @brief Sets the number of smoother applications on the coarse level before interpolation to the finer level. */
+  void set_postsmooth_steps(vcl_size_t steps) { postsmooth_steps_ = steps; }
+  /** @brief Returns the number of smoother applications on the coarse level before interpolation to the finer level. */
+  vcl_size_t get_postsmooth_steps() const { return postsmooth_steps_; }
+
+  /** @brief Sets the number of coarse levels. If set to zero, then coarse levels are constructed until the cutoff size is reached. */
+  void set_coarse_levels(vcl_size_t levels)  { coarse_levels_ = levels; }
+  /** @brief Returns the number of coarse levels. If zero, then coarse levels are constructed until the cutoff size is reached. */
+  vcl_size_t get_coarse_levels() const { return coarse_levels_; }
+
+  /** @brief Sets the coarse grid size for which the recursive multigrid scheme is stopped and a direct solver is used. */
+  void set_coarsening_cutoff(vcl_size_t size)  { coarse_cutoff_ = size; }
+  /** @brief Returns the coarse grid size for which the recursive multigrid scheme is stopped and a direct solver is used. */
+  vcl_size_t get_coarsening_cutoff() const { return coarse_cutoff_; }
+
+  /** @brief Sets the ViennaCL context for the setup stage. Set this to a host context if you want to run the setup on the host.
+    *
+    * Set the ViennaCL context for the solver application via set_target_context().
+    * Target and setup context can be different.
+    */
+  void set_setup_context(viennacl::context ctx)  { setup_ctx_ = ctx; }
+  /** @brief Returns the ViennaCL context for the preconditioner setup. */
+  viennacl::context const & get_setup_context() const { return setup_ctx_; }
+
+  /** @brief Sets the ViennaCL context for the solver cycle stage (i.e. preconditioner applications).
+    *
+    * Since the cycle stage easily benefits from accelerators, you usually want to set this to a CUDA or OpenCL-enabled context.
+    */
+  void set_target_context(viennacl::context ctx)  { target_ctx_ = ctx; }
+  /** @brief Returns the ViennaCL context for the solver cycle stage (i.e. preconditioner applications). */
+  viennacl::context const & get_target_context() const { return target_ctx_; }
+
+private:
+  amg_coarsening_method coarsening_method_;
+  amg_interpolation_method interpolation_method_;
+  double strong_connection_threshold_, jacobi_weight_;
+  vcl_size_t presmooth_steps_, postsmooth_steps_, coarse_levels_, coarse_cutoff_;
+  viennacl::context setup_ctx_, target_ctx_;
+};
+
+
+namespace detail
+{
+namespace amg
+{
+
+
+  struct amg_level_context
+  {
+    void resize(vcl_size_t num_points, vcl_size_t max_nnz)
+    {
+      influence_jumper_.resize(num_points + 1, false);
+      influence_ids_.resize(max_nnz, false);
+      influence_values_.resize(num_points, false);
+      point_types_.resize(num_points, false);
+      coarse_id_.resize(num_points, false);
+    }
+
+    void switch_context(viennacl::context ctx)
+    {
+      influence_jumper_.switch_memory_context(ctx);
+      influence_ids_.switch_memory_context(ctx);
+      influence_values_.switch_memory_context(ctx);
+      point_types_.switch_memory_context(ctx);
+      coarse_id_.switch_memory_context(ctx);
+    }
+
+    enum
+    {
+      POINT_TYPE_UNDECIDED = 0,
+      POINT_TYPE_COARSE,
+      POINT_TYPE_FINE
+    } amg_point_types;
+
+    viennacl::vector<unsigned int> influence_jumper_; // similar to row_buffer for CSR matrices
+    viennacl::vector<unsigned int> influence_ids_;    // IDs of influencing points
+    viennacl::vector<unsigned int> influence_values_; // Influence measure for each point
+    viennacl::vector<unsigned int> point_types_;      // 0: undecided, 1: coarse point, 2: fine point (cf. the enum above; stored as unsigned int rather than the enum type)
+    viennacl::vector<unsigned int> coarse_id_;        // coarse ID used on the next level. Only valid for coarse points. Fine points may (ab)use their entry for something else.
+    unsigned int num_coarse_;
+  };
+
+
+} //namespace amg
+}
+}
+}
+
+#endif
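
Since amg_tag is a plain parameter container, configuring it amounts to calling the setters above. A short illustrative sketch follows; the function name and the numeric values are placeholders, not recommendations:

    #include "viennacl/linalg/detail/amg/amg_base.hpp"

    void configure_amg_sketch(viennacl::linalg::amg_tag & tag)
    {
      tag.set_coarsening_method(viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION);
      tag.set_interpolation_method(viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION);
      tag.set_strong_connection_threshold(0.25);   // problem dependent, see the setter's documentation
      tag.set_jacobi_weight(0.67);                 // damping for the Jacobi smoother
      tag.set_presmooth_steps(1);
      tag.set_postsmooth_steps(1);
      tag.set_coarsening_cutoff(100);              // use a direct solver below 100 unknowns
    }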

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..8308f77
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
@@ -0,0 +1,191 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
+    @brief Kernel calls for the bisection algorithm
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/opencl/bisect_kernel_calls.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/bisect_kernel_calls.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+ template<typename NumericT>
+ void bisectSmall(const InputData<NumericT> &input, ResultDataSmall<NumericT> &result,
+                  const unsigned int mat_size,
+                  const NumericT lg, const NumericT ug,
+                  const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectSmall(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectSmall(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+                    const unsigned int mat_size,
+                    const NumericT lg, const NumericT ug,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectLarge(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_OneIntervals(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+                    const unsigned int mat_size,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectLargeOneIntervals(input, result,
+                                             mat_size,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge_OneIntervals(input, result,
+                                             mat_size,
+                                             precision);
+
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_MultIntervals(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+                    const unsigned int mat_size,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::bisectLargeMultIntervals(input, result,
+                                           mat_size,
+                                           precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge_MultIntervals(input, result,
+                                             mat_size,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+} // namespace detail
+} // namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
new file mode 100755
index 0000000..337858f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
@@ -0,0 +1,142 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/bisect_large.hpp
+    @brief Computation of eigenvalues of a large symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+// includes, system
+#include <iostream>
+#include <iomanip>  
+#include <stdlib.h>
+#include <stdio.h>
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run the kernels to compute the eigenvalues for large matrices
+//! @param  input   handles to input data
+//! @param  result  handles to result data
+//! @param  mat_size  matrix size
+//! @param  precision  desired precision of eigenvalues
+//! @param  lg  lower limit of Gerschgorin interval
+//! @param  ug  upper limit of Gerschgorin interval
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesLargeMatrix(InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+                              const unsigned int mat_size,
+                              const NumericT lg, const NumericT ug,  const NumericT precision)
+{
+
+
+  // First kernel call: decide on which intervals bisect_Large_OneIntervals/
+  // bisect_Large_MultIntervals is executed
+  viennacl::linalg::detail::bisectLarge(input, result, mat_size, lg, ug, precision);
+
+  // compute eigenvalues for intervals that contained only one eigenvalue
+  // after the first processing step
+  viennacl::linalg::detail::bisectLarge_OneIntervals(input, result, mat_size, precision);
+
+  // process intervals that contained more than one eigenvalue after
+  // the first processing step
+  viennacl::linalg::detail::bisectLarge_MultIntervals(input, result, mat_size, precision);
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result, i.e. obtain the result from the device and perform simple
+//! sanity checking
+//! @param  result  handles to result data
+//! @param  mat_size  matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+bool
+processResultDataLargeMatrix(ResultDataLarge<NumericT> &result,
+                             const unsigned int mat_size)
+{
+    bool bCompareResult = true;
+    // copy data from intervals that contained more than one eigenvalue after
+    // the first processing step
+    std::vector<NumericT> lambda_mult(mat_size);
+    viennacl::copy(result.g_lambda_mult, lambda_mult);
+
+    std::vector<unsigned int> pos_mult(mat_size);
+    viennacl::copy(result.g_pos_mult, pos_mult);
+
+    std::vector<unsigned int> blocks_mult_sum(mat_size);
+    viennacl::copy(result.g_blocks_mult_sum, blocks_mult_sum);
+
+    unsigned int num_one_intervals = result.g_num_one;
+    unsigned int sum_blocks_mult = mat_size - num_one_intervals;
+
+
+    // copy data for intervals that contained one eigenvalue after the first
+    // processing step
+    std::vector<NumericT> left_one(mat_size);
+    std::vector<NumericT> right_one(mat_size);
+    std::vector<unsigned int> pos_one(mat_size);
+
+    viennacl::copy(result.g_left_one, left_one);
+    viennacl::copy(result.g_right_one, right_one);
+    viennacl::copy(result.g_pos_one, pos_one);
+
+
+    // singleton intervals generated in the second step
+    for (unsigned int i = 0; i < sum_blocks_mult; ++i)
+    {
+      if (pos_mult[i] != 0)
+        result.std_eigenvalues[pos_mult[i] - 1] = lambda_mult[i];
+
+      else
+      {
+        throw memory_exception("Invalid array index! Are there more than 256 equal eigenvalues?");
+      }
+    }
+
+    // singleton intervals generated in the first step
+    unsigned int index = 0;
+
+    for (unsigned int i = 0; i < num_one_intervals; ++i, ++index)
+    {
+        result.std_eigenvalues[pos_one[i] - 1] = left_one[i];
+    }
+    return bCompareResult;
+}
+} // namespace detail
+}  // namespace linalg
+}  // namespace viennacl
+#endif //VIENNACL_LINALG_DETAIL_BISECT_LARGE_HPP_
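
The two routines above are intended to be called back to back. A minimal sketch of that sequence, assuming input and result have already been populated via the helpers in structs.hpp (their setup lies outside this file) and that the Gerschgorin bounds lg/ug were computed beforehand; the wrapper function name is hypothetical:

    template<typename NumericT>
    void run_large_bisection_sketch(viennacl::linalg::detail::InputData<NumericT>       & input,
                                    viennacl::linalg::detail::ResultDataLarge<NumericT> & result,
                                    unsigned int mat_size,
                                    NumericT lg, NumericT ug, NumericT precision)
    {
      // bisection passes on the compute backend, followed by transfer and unpacking of the eigenvalues
      viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
      viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
      // result.std_eigenvalues now holds the computed eigenvalues
    }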

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
new file mode 100755
index 0000000..144640b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
@@ -0,0 +1,96 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/bisect_small.hpp
+    @brief Computation of eigenvalues of a small symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+// includes, project
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX
+//! @param  input  handles to input data of kernel
+//! @param  result handles to result of kernel
+//! @param  mat_size  matrix size
+//! @param  lg  lower limit of Gerschgorin interval
+//! @param  ug  upper limit of Gerschgorin interval
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesSmallMatrix(const InputData<NumericT> &input, ResultDataSmall<NumericT> &result,
+                              const unsigned int mat_size,
+                              const NumericT lg, const NumericT ug,
+                              const NumericT precision)
+{
+  viennacl::linalg::detail::bisectSmall( input, result, mat_size, lg, ug, precision);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result obtained on the device, i.e. transfer it to the host and
+//! perform basic sanity checking
+//! @param  result  handles to result data
+//! @param  mat_size   matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+processResultSmallMatrix(ResultDataSmall<NumericT> &result,
+                         const unsigned int mat_size)
+{
+  // copy data back to host
+  std::vector<NumericT> left(mat_size);
+  std::vector<unsigned int> left_count(mat_size);
+
+  viennacl::copy(result.vcl_g_left, left);
+  viennacl::copy(result.vcl_g_left_count, left_count);
+
+  for (unsigned int i = 0; i < mat_size; ++i)
+  {
+      result.std_eigenvalues[left_count[i]] = left[i];
+  }
+}
+}  // namespace detail
+}  // namespace linalg
+} // namespace viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
new file mode 100755
index 0000000..3afa509
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
@@ -0,0 +1,44 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+
+/** @file viennacl/linalg/detail/bisect/config.hpp
+    @brief Global configuration parameters
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// should be power of two
+#define  VIENNACL_BISECT_MAX_THREADS_BLOCK                256
+
+#ifdef VIENNACL_WITH_OPENCL
+#  define VIENNACL_BISECT_MAX_SMALL_MATRIX                 256
+#  define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256
+#else                                                          // OpenCL not enabled (e.g. CUDA backend)
+#  define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   512 // change to 256 if errors occur
+#  define VIENNACL_BISECT_MAX_SMALL_MATRIX                 512 // change to 256 if errors occur
+#endif
+
+#define  VIENNACL_BISECT_MIN_ABS_INTERVAL                 5.0e-37
+
+#endif
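
These limits bound the matrix sizes the bisection kernels are written for, and callers are expected to pick the small-matrix or large-matrix path accordingly. A hedged sketch of such a dispatch follows; the function name is hypothetical and only the macros come from this header.

    #include "viennacl/linalg/detail/bisect/config.hpp"

    // Hypothetical dispatcher for illustration only.
    inline bool use_small_matrix_path(unsigned int mat_size)
    {
      // One work-group/thread block of up to
      // VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX threads handles the whole
      // matrix; otherwise the multi-step large-matrix path with
      // VIENNACL_BISECT_MAX_THREADS_BLOCK threads per block is required.
      return mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX;
    }
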

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
new file mode 100755
index 0000000..53cd863
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
@@ -0,0 +1,94 @@
+#ifndef _VIENNACL_LINALG_DETAIL_BISECT_GERSCHORIN_HPP_
+#define _VIENNACL_LINALG_DETAIL_BISECT_GERSCHORIN_HPP_
+
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/gerschgorin.hpp
+    @brief  Computation of Gerschgorin interval for symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cfloat>
+
+#include "viennacl/linalg/detail/bisect/util.hpp"
+#include "viennacl/vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  //! Compute Gerschgorin interval for symmetric, tridiagonal matrix
+  //! @param  d  diagonal elements
+  //! @param  s  superdiagonal elements (s[0] is never read)
+  //! @param  n  size of matrix
+  //! @param  lg  lower limit of Gerschgorin interval
+  //! @param  ug  upper limit of Gerschgorin interval
+  ////////////////////////////////////////////////////////////////////////////////
+  template<typename NumericT>
+  void
+  computeGerschgorin(std::vector<NumericT> & d, std::vector<NumericT> & s, unsigned int n, NumericT &lg, NumericT &ug)
+  {
+      // compute bounds
+      for (unsigned int i = 1; i < (n - 1); ++i)
+      {
+
+          // sum over the absolute values of all elements of row i
+          NumericT sum_abs_ni = fabsf(s[i]) + fabsf(s[i + 1]);
+
+          lg = min(lg, d[i] - sum_abs_ni);
+          ug = max(ug, d[i] + sum_abs_ni);
+      }
+
+      // first and last row, only one superdiagonal element
+
+      // first row
+      lg = min(lg, d[0] - fabsf(s[1]));
+      ug = max(ug, d[0] + fabsf(s[1]));
+
+      // last row
+      lg = min(lg, d[n-1] - fabsf(s[n-1]));
+      ug = max(ug, d[n-1] + fabsf(s[n-1]));
+
+      // increase interval to avoid side effects of fp arithmetic
+      NumericT bnorm = max(fabsf(ug), fabsf(lg));
+
+      // these values depend on the implementation of the floating-point
+      // arithmetic employed in the following
+      NumericT psi_0 = 11 * FLT_EPSILON * bnorm;
+      NumericT psi_n = 11 * FLT_EPSILON * bnorm;
+
+      lg = lg - bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON - psi_0;
+      ug = ug + bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON + psi_n;
+
+      ug = max(lg, ug);
+  }
+}  // namespace detail
+}  // namespace linalg
+} // namespace viennacl
+#endif  // _VIENNACL_LINALG_DETAIL_BISECT_GERSCHORIN_HPP_
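
As a worked example, for the n = 3 tridiagonal matrix with diagonal (2, 2, 2) and superdiagonal entries -1, the union of the Gerschgorin discs is [0, 4]; the routine then widens that interval slightly to guard against floating-point round-off. A minimal sketch, assuming the caller seeds lg/ug with d[0]:

    #include <vector>
    #include "viennacl/linalg/detail/bisect/gerschgorin.hpp"

    int main()
    {
      unsigned int n = 3;
      std::vector<float> d(n, 2.0f);   // diagonal
      std::vector<float> s(n, -1.0f);  // superdiagonal; s[0] is never read
      s[0] = 0.0f;

      float lg = d[0], ug = d[0];      // assumption: seed the interval with d[0]
      viennacl::linalg::detail::computeGerschgorin(d, s, n, lg, ug);
      // lg is now slightly below 0 and ug slightly above 4 for this matrix,
      // comfortably enclosing the exact eigenvalues 2 - sqrt(2), 2, 2 + sqrt(2).
      return 0;
    }
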

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
new file mode 100755
index 0000000..1943da3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
@@ -0,0 +1,182 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/structs.hpp
+    @brief  Helper structures to simplify variable handling
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the input matrix is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct InputData
+{
+  //! host side representation of diagonal
+  std::vector<NumericT> std_a;
+  //! host side representation of superdiagonal
+  std::vector<NumericT> std_b;
+  //! device side representation of diagonal
+  viennacl::vector<NumericT> g_a;
+  //! device side representation of superdiagonal
+  viennacl::vector<NumericT> g_b;
+
+  /** @brief Initialize the input data to the algorithm
+   *
+   * @param diagonal        vector with the diagonal elements
+   * @param superdiagonal   vector with the superdiagonal elements
+   * @param sz              size of the matrix
+   */
+  InputData(std::vector<NumericT> diagonal, std::vector<NumericT> superdiagonal, const unsigned int sz) :
+              std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+  {
+   std_a = diagonal;
+   std_b = superdiagonal;
+
+   viennacl::copy(std_b, g_b);
+   viennacl::copy(std_a, g_a);
+  }
+
+  InputData(viennacl::vector<NumericT> diagonal, viennacl::vector<NumericT> superdiagonal, const unsigned int sz) :
+              std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+  {
+   g_a = diagonal;
+   g_b = superdiagonal;
+
+   viennacl::copy(g_a, std_a);
+   viennacl::copy(g_b, std_b);
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the data of the result for small matrices is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataSmall
+{
+  //! eigenvalues (host side)
+  std::vector<NumericT> std_eigenvalues;
+  //! left interval limits at the end of the computation
+  viennacl::vector<NumericT> vcl_g_left;
+  //! right interval limits at the end of the computation
+  viennacl::vector<NumericT> vcl_g_right;
+  //! number of eigenvalues smaller than the left interval limit
+  viennacl::vector<unsigned int> vcl_g_left_count;
+  //! number of eigenvalues bigger than the right interval limit
+  viennacl::vector<unsigned int> vcl_g_right_count;
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //! Initialize variables and memory for the result for small matrices
+  ////////////////////////////////////////////////////////////////////////////////
+  ResultDataSmall(const unsigned int mat_size) :
+    std_eigenvalues(mat_size), vcl_g_left(mat_size), vcl_g_right(mat_size), vcl_g_left_count(mat_size), vcl_g_right_count(mat_size) {}
+};
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the data of the result for large matrices is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataLarge
+{
+//! eigenvalues
+  std::vector<NumericT> std_eigenvalues;
+
+  //! number of intervals containing one eigenvalue after the first step
+  viennacl::scalar<unsigned int> g_num_one;
+
+  //! number of (thread) blocks of intervals containing multiple eigenvalues after the first step
+  viennacl::scalar<unsigned int> g_num_blocks_mult;
+
+  //! left interval limits of intervals containing one eigenvalue after the first iteration step
+  viennacl::vector<NumericT> g_left_one;
+
+  //! right interval limits of intervals containing one eigenvalue after the first iteration step
+  viennacl::vector<NumericT> g_right_one;
+
+  //! interval indices (position in the sorted list of eigenvalues) of intervals containing one eigenvalue after the first iteration step
+  viennacl::vector<unsigned int> g_pos_one;
+
+  //! left interval limits of intervals containing multiple eigenvalues after the first iteration step
+  viennacl::vector<NumericT> g_left_mult;
+  //! right interval limits of intervals containing multiple eigenvalues after the first iteration step
+  viennacl::vector<NumericT> g_right_mult;
+
+  //! number of eigenvalues less than the left limit of the eigenvalue intervals containing multiple eigenvalues
+  viennacl::vector<unsigned int> g_left_count_mult;
+
+  //! number of eigenvalues less than the right limit of the eigenvalue intervals containing multiple eigenvalues
+  viennacl::vector<unsigned int> g_right_count_mult;
+  //! start addresses in g_left_mult etc. of blocks of intervals containing more than one eigenvalue after the first step
+  viennacl::vector<unsigned int> g_blocks_mult;
+
+  //! accumulated number of intervals in g_left_mult etc. of blocks of intervals containing more than one eigenvalue after the first step
+  viennacl::vector<unsigned int> g_blocks_mult_sum;
+
+  //! eigenvalues that have been generated in the second step from intervals that still contained multiple eigenvalues after the first step
+  viennacl::vector<NumericT> g_lambda_mult;
+
+  //! eigenvalue index of intervals that have been generated in the second processing step
+  viennacl::vector<unsigned int> g_pos_mult;
+
+  /** @brief Initialize variables and memory for result
+   *
+   * @param  mat_size  size of the matrix
+   */
+  ResultDataLarge(unsigned int mat_size) :
+    std_eigenvalues(mat_size), g_num_one(0), g_num_blocks_mult(0),
+    g_left_one(mat_size), g_right_one(mat_size), g_pos_one(mat_size),
+    g_left_mult(mat_size), g_right_mult(mat_size), g_left_count_mult(mat_size), g_right_count_mult(mat_size),
+    g_blocks_mult(mat_size), g_blocks_mult_sum(mat_size), g_lambda_mult(mat_size), g_pos_mult(mat_size) {}
+
+};
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+
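
InputData keeps the tridiagonal matrix in both host form (std_a, std_b) and device form (g_a, g_b), and can be constructed from either side; ResultDataSmall merely allocates the interval buffers for the small-matrix path. A short sketch of the two constructors (sizes and values are arbitrary):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/detail/bisect/structs.hpp"

    void build_bisect_inputs()
    {
      unsigned int n = 16;
      std::vector<float> diag(n, 2.0f), superdiag(n, -1.0f);

      // host-side constructor: copies the std::vectors to the device
      viennacl::linalg::detail::InputData<float> in_host(diag, superdiag, n);

      // device-side constructor: copies the viennacl::vectors back to the host
      viennacl::vector<float> d_diag(n), d_superdiag(n);
      viennacl::copy(diag, d_diag);
      viennacl::copy(superdiag, d_superdiag);
      viennacl::linalg::detail::InputData<float> in_dev(d_diag, d_superdiag, n);

      // interval buffers for the small-matrix path
      viennacl::linalg::detail::ResultDataSmall<float> result(n);
      (void)in_host; (void)in_dev; (void)result;
    }
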

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
new file mode 100755
index 0000000..883d202
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
@@ -0,0 +1,106 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/util.hpp
+    @brief Utility functions
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Minimum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+T
+min(const T &lhs, const T &rhs)
+{
+
+    return (lhs < rhs) ? lhs : rhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Maximum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+T
+max(const T &lhs, const T &rhs)
+{
+
+    return (lhs < rhs) ? rhs : lhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (float)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+inline float
+sign_f(const float &val)
+{
+    return (val < 0.0f) ? -1.0f : 1.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (double)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+inline double
+sign_d(const double &val)
+{
+    return (val < 0.0) ? -1.0 : 1.0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! Get the number of blocks that are required to process \a num_threads with
+//! \a num_threads_blocks threads per block
+///////////////////////////////////////////////////////////////////////////////
+extern "C"
+inline
+unsigned int
+getNumBlocksLinear(const unsigned int num_threads,
+                   const unsigned int num_threads_block)
+{
+    const unsigned int block_rem =
+        ((num_threads % num_threads_block) != 0) ? 1 : 0;
+    return (num_threads / num_threads_block) + block_rem;
+}
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
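
getNumBlocksLinear is a plain ceiling division of the thread count by the block size; for example, 1000 threads at 256 threads per block need 4 blocks. A tiny illustrative check:

    #include <cassert>
    #include "viennacl/linalg/detail/bisect/util.hpp"

    int main()
    {
      using viennacl::linalg::detail::getNumBlocksLinear;
      assert(getNumBlocksLinear(1000, 256) == 4);  // 3 full blocks plus 1 partial block
      assert(getNumBlocksLinear(1024, 256) == 4);  // exact multiple, no extra block
      return 0;
    }
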

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
new file mode 100644
index 0000000..1540e2d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
@@ -0,0 +1,617 @@
+#ifndef VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/block_ilu.hpp
+    @brief Implementations of incomplete block factorization preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+  /** @brief Helper range class for representing a subvector of a larger buffer. */
+  template<typename VectorT, typename NumericT, typename SizeT = vcl_size_t>
+  class ilu_vector_range
+  {
+  public:
+    ilu_vector_range(VectorT & v,
+                     SizeT start_index,
+                     SizeT vec_size
+                    ) : vec_(v), start_(start_index), size_(vec_size) {}
+
+    NumericT & operator()(SizeT index)
+    {
+      assert(index < size_ && bool("Index out of bounds!"));
+      return vec_[start_ + index];
+    }
+
+    NumericT & operator[](SizeT index)
+    {
+      assert(index < size_ && bool("Index out of bounds!"));
+      return vec_[start_ + index];
+    }
+
+    SizeT size() const { return size_; }
+
+  private:
+    VectorT & vec_;
+    SizeT start_;
+    SizeT size_;
+  };
+
+  /** @brief Extracts a diagonal block from a larger system matrix
+    *
+    * @param A                   The full matrix
+    * @param diagonal_block_A    The output matrix to which the extracted block is written
+    * @param start_index         First row- and column-index of the block
+    * @param stop_index          First row- and column-index beyond the block
+    */
+  template<typename NumericT>
+  void extract_block_matrix(viennacl::compressed_matrix<NumericT> const & A,
+                            viennacl::compressed_matrix<NumericT>       & diagonal_block_A,
+                            vcl_size_t start_index,
+                            vcl_size_t stop_index
+                            )
+  {
+    assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+    assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+    assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+    NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+    unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+    unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+    NumericT     * output_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_block_A.handle());
+    unsigned int * output_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle1());
+    unsigned int * output_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle2());
+
+    vcl_size_t output_counter = 0;
+    for (vcl_size_t row = start_index; row < stop_index; ++row)
+    {
+      unsigned int buffer_col_start = A_row_buffer[row];
+      unsigned int buffer_col_end   = A_row_buffer[row+1];
+
+      output_row_buffer[row - start_index] = static_cast<unsigned int>(output_counter);
+
+      for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+      {
+        unsigned int col = A_col_buffer[buf_index];
+        if (col < start_index)
+          continue;
+
+        if (col >= static_cast<unsigned int>(stop_index))
+          continue;
+
+        output_col_buffer[output_counter] = static_cast<unsigned int>(col - start_index);
+        output_elements[output_counter] = A_elements[buf_index];
+        ++output_counter;
+      }
+      output_row_buffer[row - start_index + 1] = static_cast<unsigned int>(output_counter);
+    }
+  }
+
+} // namespace detail
+
+
+
+/** @brief A block ILU preconditioner class, can be supplied to solve()-routines
+ *
+ * @tparam MatrixT      Type of the system matrix
+ * @tparam ILUTag       Type of the tag identifying the ILU preconditioner to be used on each block.
+*/
+template<typename MatrixT, typename ILUTag>
+class block_ilu_precond
+{
+typedef typename MatrixT::value_type      ScalarType;
+
+public:
+  typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+  block_ilu_precond(MatrixT const & mat,
+                    ILUTag const & tag,
+                    vcl_size_t num_blocks = 8
+                   ) : tag_(tag), L_blocks(num_blocks), U_blocks(num_blocks)
+  {
+    // Set up vector of block indices:
+    block_indices_.resize(num_blocks);
+    for (vcl_size_t i=0; i<num_blocks; ++i)
+    {
+      vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+      vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+      block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+    }
+
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  block_ilu_precond(MatrixT const & mat,
+                    ILUTag const & tag,
+                    index_vector_type const & block_boundaries
+                   ) : tag_(tag), block_indices_(block_boundaries), L_blocks(block_boundaries.size()), U_blocks(block_boundaries.size())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+      apply_dispatch(vec, i, ILUTag());
+  }
+
+private:
+  void init(MatrixT const & A)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<ScalarType> mat(host_context);
+
+    viennacl::copy(A, mat);
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i2=0; i2<static_cast<long>(block_indices_.size()); ++i2)
+    {
+      vcl_size_t i = static_cast<vcl_size_t>(i2);
+      // Step 1: Extract blocks
+      vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+      vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+      viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+
+      detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+
+      // Step 2: Precondition blocks:
+      viennacl::switch_memory_context(L_blocks[i], host_context);
+      viennacl::switch_memory_context(U_blocks[i], host_context);
+      init_dispatch(mat_block, L_blocks[i], U_blocks[i], tag_);
+    }
+
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                     viennacl::compressed_matrix<ScalarType> & L,
+                     viennacl::compressed_matrix<ScalarType> & U,
+                     viennacl::linalg::ilu0_tag)
+  {
+    (void)U;
+    L = mat_block;
+    viennacl::linalg::precondition(L, tag_);
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                     viennacl::compressed_matrix<ScalarType> & L,
+                     viennacl::compressed_matrix<ScalarType> & U,
+                     viennacl::linalg::ilut_tag)
+  {
+    L.resize(mat_block.size1(), mat_block.size2());
+    U.resize(mat_block.size1(), mat_block.size2());
+    viennacl::linalg::precondition(mat_block, L, U, tag_);
+  }
+
+  template<typename VectorT>
+  void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilu0_tag) const
+  {
+    detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, block_indices_[i].first, L_blocks[i].size2());
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle2());
+    ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+    viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+    viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), upper_tag());
+  }
+
+  template<typename VectorT>
+  void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilut_tag) const
+  {
+    detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, block_indices_[i].first, L_blocks[i].size2());
+
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle2());
+      ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+    }
+
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks[i].handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks[i].handle2());
+      ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(U_blocks[i].handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, U_blocks[i].size2(), upper_tag());
+    }
+  }
+
+  ILUTag tag_;
+  index_vector_type block_indices_;
+  std::vector< viennacl::compressed_matrix<ScalarType> > L_blocks;
+  std::vector< viennacl::compressed_matrix<ScalarType> > U_blocks;
+};
+
+
+
+
+
+/** @brief Block ILU preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV, typename ILUTagT>
+class block_ilu_precond< compressed_matrix<NumericT, AlignmentV>, ILUTagT>
+{
+  typedef compressed_matrix<NumericT, AlignmentV>        MatrixType;
+
+public:
+  typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+  block_ilu_precond(MatrixType const & mat,
+                    ILUTagT const & tag,
+                    vcl_size_t num_blocks = 8
+                   ) : tag_(tag),
+                       block_indices_(num_blocks),
+                       gpu_block_indices_(),
+                       gpu_L_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_U_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_D_(mat.size1(), viennacl::context(viennacl::MAIN_MEMORY)),
+                       L_blocks_(num_blocks),
+                       U_blocks_(num_blocks)
+  {
+    // Set up vector of block indices:
+    block_indices_.resize(num_blocks);
+    for (vcl_size_t i=0; i<num_blocks; ++i)
+    {
+      vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+      vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+      block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+    }
+
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  block_ilu_precond(MatrixType const & mat,
+                    ILUTagT const & tag,
+                    index_vector_type const & block_boundaries
+                   ) : tag_(tag),
+                       block_indices_(block_boundaries),
+                       gpu_block_indices_(),
+                       gpu_L_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_U_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_D_(mat.size1(), viennacl::context(viennacl::MAIN_MEMORY)),
+                       L_blocks_(block_boundaries.size()),
+                       U_blocks_(block_boundaries.size())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+
+  void apply(vector<NumericT> & vec) const
+  {
+    viennacl::linalg::detail::block_inplace_solve(trans(gpu_L_trans_), gpu_block_indices_, block_indices_.size(), gpu_D_,
+                                                  vec,
+                                                  viennacl::linalg::unit_lower_tag());
+
+    viennacl::linalg::detail::block_inplace_solve(trans(gpu_U_trans_), gpu_block_indices_, block_indices_.size(), gpu_D_,
+                                                  vec,
+                                                  viennacl::linalg::upper_tag());
+
+    //apply_cpu(vec);
+  }
+
+
+private:
+
+  void init(MatrixType const & A)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<NumericT> mat(host_context);
+
+    mat = A;
+
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+    {
+      // Step 1: Extract blocks
+      vcl_size_t block_size = block_indices_[static_cast<vcl_size_t>(i)].second - block_indices_[static_cast<vcl_size_t>(i)].first;
+      vcl_size_t block_nnz  = row_buffer[block_indices_[static_cast<vcl_size_t>(i)].second] - row_buffer[block_indices_[static_cast<vcl_size_t>(i)].first];
+      viennacl::compressed_matrix<NumericT> mat_block(block_size, block_size, block_nnz, host_context);
+
+      detail::extract_block_matrix(mat, mat_block, block_indices_[static_cast<vcl_size_t>(i)].first, block_indices_[static_cast<vcl_size_t>(i)].second);
+
+      // Step 2: Precondition blocks:
+      viennacl::switch_memory_context(L_blocks_[static_cast<vcl_size_t>(i)], host_context);
+      viennacl::switch_memory_context(U_blocks_[static_cast<vcl_size_t>(i)], host_context);
+      init_dispatch(mat_block, L_blocks_[static_cast<vcl_size_t>(i)], U_blocks_[static_cast<vcl_size_t>(i)], tag_);
+    }
+
+    /*
+     * copy resulting preconditioner back to GPU:
+     */
+    viennacl::backend::typesafe_host_array<unsigned int> block_indices_uint(gpu_block_indices_, 2 * block_indices_.size());
+    for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+    {
+      block_indices_uint.set(2*i,     block_indices_[i].first);
+      block_indices_uint.set(2*i + 1, block_indices_[i].second);
+    }
+
+    viennacl::backend::memory_create(gpu_block_indices_, block_indices_uint.raw_size(), viennacl::traits::context(A), block_indices_uint.get());
+
+    blocks_to_device(A);
+
+  }
+
+  // Copy the computed preconditioner blocks back to the target device
+  void blocks_to_device(MatrixType const & A)
+  {
+    gpu_L_trans_.resize(A.size1(), A.size2());
+    gpu_U_trans_.resize(A.size1(), A.size2());
+    gpu_D_.resize(A.size1());
+
+    unsigned int * L_trans_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_L_trans_.handle1());
+    unsigned int * U_trans_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_U_trans_.handle1());
+
+    //
+    // Count elements per row
+    //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long block_index2 = 0; block_index2 < static_cast<long>(L_blocks_.size()); ++block_index2)
+    {
+      vcl_size_t block_index = vcl_size_t(block_index2);
+
+      unsigned int block_start = static_cast<unsigned int>(block_indices_[block_index].first);
+      unsigned int block_stop  = static_cast<unsigned int>(block_indices_[block_index].second);
+
+      unsigned int const * L_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle1());
+      unsigned int const * L_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle2());
+
+      // zero row array of L:
+      std::fill(L_trans_row_buffer + block_start,
+                L_trans_row_buffer + block_stop,
+                static_cast<unsigned int>(0));
+
+      // count number of elements per row:
+      for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = L_row_buffer[row];
+        unsigned int col_end   = L_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = L_col_buffer[j];
+          if (col < static_cast<unsigned int>(row))
+            L_trans_row_buffer[col + block_start] += 1;
+        }
+      }
+
+      ////// same for U
+
+      unsigned int const * U_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle1());
+      unsigned int const * U_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle2());
+
+      // zero row array of U:
+      std::fill(U_trans_row_buffer + block_start,
+                U_trans_row_buffer + block_stop,
+                static_cast<unsigned int>(0));
+
+      // count number of elements per row:
+      for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = U_row_buffer[row];
+        unsigned int col_end   = U_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = U_col_buffer[j];
+          if (col > row)
+            U_trans_row_buffer[col + block_start] += 1;
+        }
+      }
+    }
+
+
+    //
+    // Exclusive scan on row buffer (feel free to add parallelization here)
+    //
+    unsigned int current_value = 0;
+    for (vcl_size_t i=0; i<gpu_L_trans_.size1(); ++i)
+    {
+      unsigned int tmp = L_trans_row_buffer[i];
+      L_trans_row_buffer[i] = current_value;
+      current_value += tmp;
+    }
+    gpu_L_trans_.reserve(current_value);
+
+    current_value = 0;
+    for (vcl_size_t i=0; i<gpu_U_trans_.size1(); ++i)
+    {
+      unsigned int tmp = U_trans_row_buffer[i];
+      U_trans_row_buffer[i] = current_value;
+      current_value += tmp;
+    }
+    gpu_U_trans_.reserve(current_value);
+
+
+    //
+    // Fill with data
+    //
+    unsigned int       * L_trans_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_L_trans_.handle2());
+    NumericT           * L_trans_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_L_trans_.handle());
+
+    unsigned int       * U_trans_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_U_trans_.handle2());
+    NumericT           * U_trans_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_U_trans_.handle());
+
+    NumericT           * D_elements         = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_D_.handle());
+
+    std::vector<unsigned int> offset_L(gpu_L_trans_.size1());
+    std::vector<unsigned int> offset_U(gpu_U_trans_.size1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long block_index2 = 0; block_index2 < static_cast<long>(L_blocks_.size()); ++block_index2)
+    {
+      vcl_size_t   block_index = vcl_size_t(block_index2);
+      unsigned int block_start = static_cast<unsigned int>(block_indices_[block_index].first);
+
+      unsigned int const * L_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle1());
+      unsigned int const * L_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle2());
+      NumericT     const * L_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT    >(L_blocks_[block_index].handle());
+
+
+      // write L_trans:
+      for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = L_row_buffer[row];
+        unsigned int col_end   = L_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = L_col_buffer[j];
+          if (col < row)
+          {
+            unsigned int row_trans = col + block_start;
+            unsigned int k = L_trans_row_buffer[row_trans] + offset_L[row_trans];
+            offset_L[row_trans] += 1;
+
+            L_trans_col_buffer[k] = static_cast<unsigned int>(row) + block_start;
+            L_trans_elements[k]   = L_elements[j];
+          }
+        }
+      }
+
+      unsigned int const * U_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle1());
+      unsigned int const * U_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle2());
+      NumericT     const * U_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT    >(U_blocks_[block_index].handle());
+
+      // write U_trans and D:
+      for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = U_row_buffer[row];
+        unsigned int col_end   = U_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int row_trans = U_col_buffer[j] + block_start;
+          unsigned int k = U_trans_row_buffer[row_trans] + offset_U[row_trans];
+
+          if (row_trans == row + block_start) // entry for D
+          {
+            D_elements[row_trans] = U_elements[j];
+          }
+          else if (row_trans > row + block_start) //entry for U
+          {
+            offset_U[row_trans] += 1;
+
+            U_trans_col_buffer[k] = static_cast<unsigned int>(row) + block_start;
+            U_trans_elements[k]   = U_elements[j];
+          }
+        }
+      }
+
+    }
+
+    //
+    // Send to destination device:
+    //
+    viennacl::switch_memory_context(gpu_L_trans_, viennacl::traits::context(A));
+    viennacl::switch_memory_context(gpu_U_trans_, viennacl::traits::context(A));
+    viennacl::switch_memory_context(gpu_D_,       viennacl::traits::context(A));
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+                     viennacl::compressed_matrix<NumericT> & L,
+                     viennacl::compressed_matrix<NumericT> & U,
+                     viennacl::linalg::ilu0_tag)
+  {
+    L = mat_block;
+    viennacl::linalg::precondition(L, tag_);
+    U = L; // fairly poor workaround...
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+                     viennacl::compressed_matrix<NumericT> & L,
+                     viennacl::compressed_matrix<NumericT> & U,
+                     viennacl::linalg::ilut_tag)
+  {
+    L.resize(mat_block.size1(), mat_block.size2());
+    U.resize(mat_block.size1(), mat_block.size2());
+    viennacl::linalg::precondition(mat_block, L, U, tag_);
+  }
+
+
+  ILUTagT                               tag_;
+  index_vector_type                     block_indices_;
+  viennacl::backend::mem_handle         gpu_block_indices_;
+  viennacl::compressed_matrix<NumericT> gpu_L_trans_;
+  viennacl::compressed_matrix<NumericT> gpu_U_trans_;
+  viennacl::vector<NumericT>            gpu_D_;
+
+  std::vector<MatrixType> L_blocks_;
+  std::vector<MatrixType> U_blocks_;
+};
+
+
+}
+}
+
+
+
+
+#endif
+
+
+
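
For reference, the block ILU preconditioner above is used like the other ViennaCL preconditioners: build it from the assembled system matrix and an ILU tag, then pass it to an iterative solver. A hedged sketch (matrix assembly omitted; the umbrella include viennacl/linalg/ilu.hpp is assumed to pull in this header):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"  // assumed to include block_ilu.hpp
    #include "viennacl/linalg/cg.hpp"

    viennacl::vector<double> solve_with_block_ilu(viennacl::compressed_matrix<double> const & A,
                                                  viennacl::vector<double>            const & rhs)
    {
      // 8 diagonal blocks, ILU0 factorization on each block
      viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<double>,
                                           viennacl::linalg::ilu0_tag >
          precond(A, viennacl::linalg::ilu0_tag(), 8);

      // preconditioned conjugate gradient solve
      return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
    }
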

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
new file mode 100644
index 0000000..7628cdb
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
@@ -0,0 +1,316 @@
+#ifndef VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
+  @brief Implementations of incomplete factorization preconditioners with fine-grained parallelism.
+
+  Based on "Fine-Grained Parallel Incomplete LU Factorization" by Chow and Patel, SIAM J. Sci. Comput., vol. 37, no. 2, pp. C169-C193
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/ilu_operations.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/backend/memory.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU and incomplete Cholesky factorization with static pattern (Parallel-ILU0, Parallel ICC0)
+*/
+class chow_patel_tag
+{
+public:
+  /** @brief Constructor allowing to set the number of sweeps and Jacobi iterations.
+    *
+    * @param num_sweeps        Number of sweeps in setup phase
+    * @param num_jacobi_iters  Number of Jacobi iterations for each triangular 'solve' when applying the preconditioner to a vector
+    */
+  chow_patel_tag(vcl_size_t num_sweeps = 3, vcl_size_t num_jacobi_iters = 2) : sweeps_(num_sweeps), jacobi_iters_(num_jacobi_iters) {}
+
+  /** @brief Returns the number of sweeps (i.e. number of nonlinear iterations) in the solver setup stage */
+  vcl_size_t sweeps() const { return sweeps_; }
+  /** @brief Sets the number of sweeps (i.e. number of nonlinear iterations) in the solver setup stage */
+  void       sweeps(vcl_size_t num) { sweeps_ = num; }
+
+  /** @brief Returns the number of Jacobi iterations (i.e. applications of x_{k+1} = (I - D^{-1}R)x_k + D^{-1} b) for each of the solves y = U^{-1} x and z = L^{-1} y) for each preconditioner application. */
+  vcl_size_t jacobi_iters() const { return jacobi_iters_; }
+  /** @brief Sets the number of Jacobi iterations for each triangular 'solve' when applying the preconditioner to a vector. */
+  void       jacobi_iters(vcl_size_t num) { jacobi_iters_ = num; }
+
+private:
+  vcl_size_t sweeps_;
+  vcl_size_t jacobi_iters_;
+};
+
+namespace detail
+{
+  /** @brief Implementation of the parallel ICC0 factorization, Algorithm 3 in Chow-Patel paper.
+   *
+   *  Rather than dealing with a column-major upper triangular matrix U, we use the lower-triangular matrix L such that A is approximately given by LL^T.
+   *  The advantage is that L is readily available in row-major format.
+   */
+  template<typename NumericT>
+  void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                    viennacl::compressed_matrix<NumericT>       & L,
+                    viennacl::vector<NumericT>                  & diag_L,
+                    viennacl::compressed_matrix<NumericT>       & L_trans,
+                    chow_patel_tag const & tag)
+  {
+    // make sure L and U have correct dimensions:
+    L.resize(A.size1(), A.size2(), false);
+
+    // initialize L and U from values in A:
+    viennacl::linalg::extract_L(A, L);
+
+    // diagonally scale values from A in L:
+    viennacl::linalg::icc_scale(A, L);
+
+    viennacl::vector<NumericT> aij_L(L.nnz(), viennacl::traits::context(A));
+    viennacl::backend::memory_copy(L.handle(), aij_L.handle(), 0, 0, sizeof(NumericT) * L.nnz());
+
+    // run sweeps:
+    for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+      viennacl::linalg::icc_chow_patel_sweep(L, aij_L);
+
+    // transpose L to obtain L_trans:
+    viennacl::linalg::ilu_transpose(L, L_trans);
+
+    // form (I - D_L^{-1}L) and (I - D_U^{-1} U), with U := L_trans
+    viennacl::linalg::ilu_form_neumann_matrix(L,       diag_L);
+    viennacl::linalg::ilu_form_neumann_matrix(L_trans, diag_L);
+  }
+
+
+  /** @brief Implementation of the parallel ILU0 factorization, Algorithm 2 in Chow-Patel paper. */
+  template<typename NumericT>
+  void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                    viennacl::compressed_matrix<NumericT>       & L,
+                    viennacl::vector<NumericT>                  & diag_L,
+                    viennacl::compressed_matrix<NumericT>       & U,
+                    viennacl::vector<NumericT>                  & diag_U,
+                    chow_patel_tag const & tag)
+  {
+    // make sure L and U have correct dimensions:
+    L.resize(A.size1(), A.size2(), false);
+    U.resize(A.size1(), A.size2(), false);
+
+    // initialize L and U from values in A:
+    viennacl::linalg::extract_LU(A, L, U);
+
+    // diagonally scale values from A in L and U:
+    viennacl::linalg::ilu_scale(A, L, U);
+
+    // transpose storage layout of U from CSR to CSC via transposition
+    viennacl::compressed_matrix<NumericT> U_trans;
+    viennacl::linalg::ilu_transpose(U, U_trans);
+
+    // keep entries of a_ij for the sweeps
+    viennacl::vector<NumericT> aij_L      (L.nnz(),       viennacl::traits::context(A));
+    viennacl::vector<NumericT> aij_U_trans(U_trans.nnz(), viennacl::traits::context(A));
+
+    viennacl::backend::memory_copy(      L.handle(), aij_L.handle(),       0, 0, sizeof(NumericT) * L.nnz());
+    viennacl::backend::memory_copy(U_trans.handle(), aij_U_trans.handle(), 0, 0, sizeof(NumericT) * U_trans.nnz());
+
+    // run sweeps:
+    for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+      viennacl::linalg::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+
+    // transpose U_trans back:
+    viennacl::linalg::ilu_transpose(U_trans, U);
+
+    // form (I - D_L^{-1}L) and (I - D_U^{-1} U)
+    viennacl::linalg::ilu_form_neumann_matrix(L, diag_L);
+    viennacl::linalg::ilu_form_neumann_matrix(U, diag_U);
+  }
+
+}
+
+
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_icc_precond
+{
+  // only works with compressed_matrix!
+  typedef typename MatrixT::CHOW_PATEL_ICC_ONLY_WORKS_WITH_COMPRESSED_MATRIX  error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_icc_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+
+public:
+  chow_patel_icc_precond(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, chow_patel_tag const & tag)
+    : tag_(tag),
+      L_(0, 0, 0, viennacl::traits::context(A)),
+      diag_L_(A.size1(), viennacl::traits::context(A)),
+      L_trans_(0, 0, 0, viennacl::traits::context(A)),
+      x_k_(A.size1(), viennacl::traits::context(A)),
+      b_(A.size1(), viennacl::traits::context(A))
+  {
+    viennacl::linalg::detail::precondition(A, L_, diag_L_, L_trans_, tag_);
+  }
+
+  /** @brief Preconditioner application: LL^Tx = b, computed via Ly = b, L^Tx = y using Jacobi iterations.
+    *
+    * L contains (I - D_L^{-1}L), L_trans contains (I - D_L^{-1}L^T) where D denotes the respective diagonal matrix
+    */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //
+    // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(vec, diag_L_);
+    x_k_ = b_;
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    //
+    // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}L^T)x_k + D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(x_k_, diag_L_);
+    x_k_ = b_; // x_1 if x_0 \equiv 0
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_trans_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    // return result:
+    vec = x_k_;
+  }
+
+private:
+  chow_patel_tag                          tag_;
+  viennacl::compressed_matrix<NumericT>   L_;
+  viennacl::vector<NumericT>              diag_L_;
+  viennacl::compressed_matrix<NumericT>   L_trans_;
+
+  mutable viennacl::vector<NumericT>      x_k_;
+  mutable viennacl::vector<NumericT>      b_;
+};
+
+
+
+
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_ilu_precond
+{
+  // only works with compressed_matrix!
+  typedef typename MatrixT::CHOW_PATEL_ILU_ONLY_WORKS_WITH_COMPRESSED_MATRIX  error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_ilu_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+
+public:
+  chow_patel_ilu_precond(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, chow_patel_tag const & tag)
+    : tag_(tag),
+      L_(0, 0, 0, viennacl::traits::context(A)),
+      diag_L_(A.size1(), viennacl::traits::context(A)),
+      U_(0, 0, 0, viennacl::traits::context(A)),
+      diag_U_(A.size1(), viennacl::traits::context(A)),
+      x_k_(A.size1(), viennacl::traits::context(A)),
+      b_(A.size1(), viennacl::traits::context(A))
+  {
+    viennacl::linalg::detail::precondition(A, L_, diag_L_, U_, diag_U_, tag_);
+  }
+
+  /** @brief Preconditioner application: LUx = b, computed via Ly = b, Ux = y using Jacobi iterations.
+    *
+    * L_ contains (I - D_L^{-1}L), U_ contains (I - D_U^{-1}U) where D denotes the respective diagonal matrix
+    */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //
+    // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(vec, diag_L_);
+    x_k_ = b_;
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    //
+    // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}U)x_k + D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(x_k_, diag_U_);
+    x_k_ = b_; // x_1 if x_0 \equiv 0
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(U_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    // return result:
+    vec = x_k_;
+  }
+
+private:
+  chow_patel_tag                          tag_;
+  viennacl::compressed_matrix<NumericT>   L_;
+  viennacl::vector<NumericT>              diag_L_;
+  viennacl::compressed_matrix<NumericT>   U_;
+  viennacl::vector<NumericT>              diag_U_;
+
+  mutable viennacl::vector<NumericT>      x_k_;
+  mutable viennacl::vector<NumericT>      b_;
+};
+
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+
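
The Chow-Patel preconditioner follows the same pattern, with the tag controlling the number of nonlinear sweeps in the setup phase and the number of Jacobi iterations per application. A hedged sketch (again the umbrella include is an assumption):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"       // assumed to include chow_patel_ilu.hpp
    #include "viennacl/linalg/bicgstab.hpp"

    viennacl::vector<double> solve_with_chow_patel(viennacl::compressed_matrix<double> const & A,
                                                   viennacl::vector<double>            const & rhs)
    {
      viennacl::linalg::chow_patel_tag tag;
      tag.sweeps(4);        // nonlinear sweeps in the setup phase
      tag.jacobi_iters(2);  // Jacobi iterations per triangular 'solve'

      viennacl::linalg::chow_patel_ilu_precond< viennacl::compressed_matrix<double> >
          precond(A, tag);

      return viennacl::linalg::solve(A, rhs, viennacl::linalg::bicgstab_tag(), precond);
    }
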


[25/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
new file mode 100644
index 0000000..907eb57
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
@@ -0,0 +1,497 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/qr.hpp
+    @brief Implementation of a simultaneous QR factorization of multiple matrices. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include "viennacl/ocl/backend.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+//********** DEBUG FUNCTIONS *****************//
+template< typename T, typename InputIteratorT>
+void Print(std::ostream & ostr, InputIteratorT it_begin, InputIteratorT it_end)
+{
+  //std::ostream_iterator<int> it_os(ostr, delimiter);
+  std::string delimiters = " ";
+  std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str()));
+  ostr << std::endl;
+}
+
+template<typename VectorT, typename MatrixT>
+void write_to_block(VectorT & con_A_I_J,
+                    unsigned int start_ind,
+                    std::vector<unsigned int> const & I,
+                    std::vector<unsigned int> const & J,
+                    MatrixT& m)
+{
+  m.resize(I.size(), J.size(), false);
+  for (vcl_size_t i = 0; i < J.size(); ++i)
+    for (vcl_size_t j = 0; j < I.size(); ++j)
+      m(j,i) = con_A_I_J[start_ind + i*I.size() + j];
+}
+
+template<typename VectorT>
+void print_continious_matrix(VectorT & con_A_I_J,
+                             std::vector<cl_uint> & blocks_ind,
+                             std::vector<std::vector<unsigned int> > const & g_I,
+                             std::vector<std::vector<unsigned int> > const & g_J)
+{
+  typedef typename VectorT::value_type        NumericType;
+
+  std::vector<boost::numeric::ublas::matrix<NumericType> > com_A_I_J(g_I.size());
+  for (vcl_size_t i = 0; i < g_I.size(); ++i)
+  {
+    write_to_block(con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]);
+    std::cout << com_A_I_J[i] << std::endl;
+  }
+}
+
+template<typename VectorT>
+void print_continious_vector(VectorT & con_v,
+                             std::vector<cl_uint> & block_ind,
+                             std::vector<std::vector<unsigned int> > const & g_J)
+{
+  typedef typename VectorT::value_type     NumericType;
+
+  std::vector<boost::numeric::ublas::vector<NumericType> > com_v(g_J.size());
+  //Print<ScalarType>(std::cout, con_v.begin(), con_v.end());
+  for (vcl_size_t i = 0; i < g_J.size(); ++i)
+  {
+    com_v[i].resize(g_J[i].size());
+    for (vcl_size_t j = 0; j < g_J[i].size(); ++j)
+      com_v[i](j) = con_v[block_ind[i] + j];
+    std::cout << com_v[i] << std::endl;
+  }
+}
+
+
+///**************************************** BLOCK FUNCTIONS ************************************//
+
+/** @brief Computes size of elements, start indices and matrix dimensions for a certain block
+ *
+ * @param g_I         container of row indices
+ * @param g_J         container of column indices
+ * @param sz          total number of elements over all blocks (output)
+ * @param blocks_ind  start index of each block within the contiguous storage (output)
+ * @param matrix_dims matrix dimensions for each block
+ */
+inline void compute_blocks_size(std::vector<std::vector<unsigned int> > const & g_I,
+                                std::vector<std::vector<unsigned int> > const & g_J,
+                                unsigned int& sz,
+                                std::vector<cl_uint> & blocks_ind,
+                                std::vector<cl_uint> & matrix_dims)
+{
+  sz = 0;
+  for (vcl_size_t i = 0; i < g_I.size(); ++i)
+  {
+    sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size());
+    matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size());
+    matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size());
+    blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size());
+  }
+}
+
+/** @brief Computes the total number of indices in a container of index sets
+ *
+ * @param inds   container of index sets
+ * @param size   output size
+ */
+template<typename SizeT>
+void get_size(std::vector<std::vector<SizeT> > const & inds,
+              SizeT & size)
+{
+  size = 0;
+  for (vcl_size_t i = 0; i < inds.size(); ++i)
+    size += static_cast<unsigned int>(inds[i].size());
+}
+
+/** @brief Initializes the start indices for a container of index sets
+ *
+ * @param inds         container of index sets
+ * @param start_inds   output index set
+ */
+template<typename SizeT>
+void init_start_inds(std::vector<std::vector<SizeT> > const & inds,
+                     std::vector<cl_uint>& start_inds)
+{
+  for (vcl_size_t i = 0; i < inds.size(); ++i)
+    start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size());
+}
+
+
+//*************************************  QR FUNCTIONS  ***************************************//
+
+/** @brief Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind
+ *
+ * @param A        init matrix
+ * @param beg_ind  starting index
+ * @param res      result of dot product
+ */
+template<typename MatrixT, typename NumericT>
+void dot_prod(MatrixT const & A,
+              unsigned int beg_ind,
+              NumericT & res)
+{
+  res = NumericT(0);
+  for (vcl_size_t i = beg_ind; i < A.size1(); ++i)
+    res += A(i, beg_ind-1)*A(i, beg_ind-1);
+}
+
+/** @brief Dot prod of particular matrix column with arbitrary vector: A(:, col_ind)
+ *
+ * @param A           init matrix
+ * @param v           input vector
+ * @param col_ind     starting column index
+ * @param start_ind   starting index inside column
+ * @param res         result of dot product
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void custom_inner_prod(MatrixT const & A,
+                       VectorT const & v,
+                       unsigned int col_ind,
+                       unsigned int start_ind,
+                       NumericT & res)
+{
+  res = static_cast<NumericT>(0);
+  for (unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i)
+    res += A(i, col_ind)*v(i);
+}
+
+/** @brief Copying part of matrix column
+ *
+ * @param A         init matrix
+ * @param v         output vector
+ * @param beg_ind   start index for copying
+ */
+template<typename MatrixT, typename VectorT>
+void copy_vector(MatrixT const & A,
+                 VectorT       & v,
+                 unsigned int beg_ind)
+{
+  for (unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i)
+    v(i) = A( i, beg_ind-1);
+}
+
+
+/** @brief Computation of the Householder vector for a Householder reflection, cf. Gene H. Golub, Charles F. Van Loan, "Matrix Computations", 3rd edition, p. 210
+ *
+ * @param A     init matrix
+ * @param j     start index for computations
+ * @param v     output Householder vector
+ * @param b     beta
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void householder_vector(MatrixT const & A,
+                        unsigned int j,
+                        VectorT & v,
+                        NumericT & b)
+{
+  NumericT sg;
+
+  dot_prod(A, j+1, sg);
+  copy_vector(A, v, j+1);
+  NumericT mu;
+  v(j) = static_cast<NumericT>(1.0);
+  if (!sg)
+    b = 0;
+  else
+  {
+    mu = std::sqrt(A(j,j)*A(j, j) + sg);
+    if (A(j, j) <= 0)
+      v(j) = A(j, j) - mu;
+    else
+      v(j) = -sg/(A(j, j) + mu);
+
+    b = 2*(v(j)*v(j))/(sg + v(j)*v(j));
+    v = v/v(j);
+  }
+}
+
+
+/** @brief Inplace application of Householder vector to a matrix A
+ *
+ * @param A          init matrix
+ * @param iter_cnt   current iteration
+ * @param v          Householder vector
+ * @param b          beta
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void apply_householder_reflection(MatrixT & A,
+                                  unsigned int iter_cnt,
+                                  VectorT & v,
+                                  NumericT b)
+{
+  //update every column of matrix A
+  NumericT in_prod_res;
+
+  for (unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i)
+  {
+    //update each column in a fashion: ai = ai - b*v*(v'*ai)
+    custom_inner_prod(A, v, i, iter_cnt, in_prod_res);
+    for (unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j)
+      A(j, i) -= b*in_prod_res*v(j);
+  }
+}
+
+/** @brief Stores the Householder vector v in column ind-1 of A, starting at row ind
+ *
+ * @param A     init matrix
+ * @param ind   index of a column
+ * @param v     vector that should be stored
+ */
+template<typename MatrixT, typename VectorT>
+void store_householder_vector(MatrixT & A,
+                              unsigned int ind,
+                              VectorT & v)
+{
+  for (unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i)
+    A(i, ind-1) = v(i);
+}
+
+
+//QR algorithm
+/** @brief In-place QR factorization via Householder reflections, cf. Gene H. Golub, Charles F. Van Loan, "Matrix Computations", 3rd edition, p. 224
+ *
+ * @param R     input matrix
+ * @param b_v   vector of betas
+ */
+template<typename MatrixT, typename VectorT>
+void single_qr(MatrixT & R, VectorT & b_v)
+{
+  typedef typename MatrixT::value_type     NumericType;
+
+  if ((R.size1() > 0) && (R.size2() > 0))
+  {
+    VectorT v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size1()));
+    b_v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size2()));
+
+    for (unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i)
+    {
+      householder_vector(R, i, v, b_v[i]);
+      apply_householder_reflection(R, i, v, b_v[i]);
+      if (i < R.size1())
+        store_householder_vector(R, i+1, v);
+    }
+  }
+}
+
+//********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************//
+
+/** @brief Getting max size of rows/columns from container of index set
+ *
+ * @param inds        container of index set
+ * @param max_size    max size that corresponds to that container
+ */
+template<typename SizeT>
+void get_max_block_size(std::vector<std::vector<SizeT> > const & inds,
+                        SizeT & max_size)
+{
+  max_size = 0;
+  for (vcl_size_t i = 0; i < inds.size(); ++i)
+    if (inds[i].size() > max_size)
+      max_size = static_cast<SizeT>(inds[i].size());
+}
+
+/** @brief Dot product of column(A, ind) with v, starting at row ind, where the entry at row ind is implicitly taken as 1 (stored Householder vector convention)
+ *
+ * @param A      input matrix
+ * @param v      input vector
+ * @param ind    index
+ * @param res    result value
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void custom_dot_prod(MatrixT const & A,
+                     VectorT const & v,
+                     unsigned int ind,
+                     NumericT & res)
+{
+  res = static_cast<NumericT>(0);
+  for (unsigned int j = ind; j < A.size1(); ++j)
+  {
+    if (j == ind)
+      res += v(j);
+    else
+      res += A(j, ind)*v(j);
+  }
+}
+
+/** @brief Applies Q^T to a vector y, where Q is implicitly given by the Householder vectors stored in R and the vector of betas b_v
+ *
+ * @param R      input matrix (Householder vectors in its lower part)
+ * @param b_v    vector of betas
+ * @param y      vector that is transformed in-place
+ */
+template<typename MatrixT, typename VectorT>
+void apply_q_trans_vec(MatrixT const & R,
+                       VectorT const & b_v,
+                       VectorT       & y)
+{
+  typedef typename MatrixT::value_type     NumericT;
+
+  NumericT inn_prod = NumericT(0);
+  for (vcl_size_t i = 0; i < R.size2(); ++i)
+  {
+    custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod);
+    for (vcl_size_t j = i; j < R.size1(); ++j)
+    {
+      if (i == j)
+        y(j) -= b_v(i)*inn_prod;
+      else
+        y(j) -= b_v(i)*inn_prod*R(j,i);
+    }
+  }
+}
+
+/** @brief Multiplication Q^T * A, where Q is implicitly given by the Householder vectors in the lower part of R and the vector of betas b_v
+ *
+ * @param R      input matrix
+ * @param b_v    vector of betas
+ * @param A      matrix that is transformed in-place
+ */
+template<typename MatrixT, typename VectorT>
+void apply_q_trans_mat(MatrixT const & R,
+                       VectorT const & b_v,
+                       MatrixT       & A)
+{
+  VectorT tmp_v;
+  for (vcl_size_t i = 0; i < A.size2(); ++i)
+  {
+    tmp_v = static_cast<VectorT>(column(A,i));
+    apply_q_trans_vec(R, b_v, tmp_v);
+    column(A,i) = tmp_v;
+  }
+}
+
+//parallel QR for GPU
+/** @brief In-place QR factorization via Householder reflections, cf. Gene H. Golub, Charles F. Van Loan, "Matrix Computations", 3rd edition, p. 224, performed on the GPU
+ *
+ * @param g_I         container of row indices
+ * @param g_J         container of column indices
+ * @param g_A_I_J_vcl contiguous matrices, GPU memory is used
+ * @param g_bv_vcl    contiguous vectors of betas, GPU memory is used
+ * @param g_is_update container of indicators that show active blocks
+ * @param ctx         Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void block_qr(std::vector<std::vector<unsigned int> > & g_I,
+              std::vector<std::vector<unsigned int> > & g_J,
+              block_matrix & g_A_I_J_vcl,
+              block_vector & g_bv_vcl,
+              std::vector<cl_uint> & g_is_update,
+              viennacl::context ctx)
+{
+  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+  //typedef typename MatrixType::value_type ScalarType;
+  unsigned int bv_size = 0;
+  unsigned int v_size = 0;
+  //set up arguments for GPU
+  //find maximum size of rows/columns
+  unsigned int local_r_n = 0;
+  unsigned int local_c_n = 0;
+  //find max size for blocks
+  get_max_block_size(g_I, local_r_n);
+  get_max_block_size(g_J, local_c_n);
+  //get size
+  get_size(g_J, bv_size);
+  get_size(g_I, v_size);
+  //get start indices
+  std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
+  std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
+  init_start_inds(g_J, start_bv_inds);
+  init_start_inds(g_I, start_v_inds);
+  //init arrays
+  std::vector<NumericT> b_v(bv_size, NumericT(0));
+  std::vector<NumericT>   v(v_size,  NumericT(0));
+  //call qr program
+  block_vector v_vcl;
+
+  g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                               static_cast<unsigned int>(sizeof(NumericT)*bv_size),
+                                               &(b_v[0]));
+
+  v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                            static_cast<unsigned int>(sizeof(NumericT)*v_size),
+                                            &(v[0]));
+  //the same as j_start_inds
+  g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+                                                &(start_bv_inds[0]));
+
+  v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                             static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+                                             &(start_v_inds[0]));
+  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                           static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+                                                                           &(g_is_update[0]));
+  //local memory
+  //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
+  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+  viennacl::ocl::kernel & qr_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr");
+
+  qr_kernel.local_work_size(0, local_c_n);
+  qr_kernel.global_work_size(0, local_c_n*256);
+  viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(),
+                                  v_vcl.handle(), g_A_I_J_vcl.handle2(),
+                                  g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
+                                  viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
+                                  static_cast<cl_uint>(g_I.size())));
+
+}
+} //namespace spai
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+#endif

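For orientation, single_qr() above factors a block in place (R in the upper triangle, Householder vectors below the diagonal, betas in b_v), and apply_q_trans_vec() then applies Q^T to a right-hand side; together they solve a small least-squares problem. A rough host-side usage sketch with Boost.uBLAS types (toy data; qr_least_squares_sketch is an invented name, and the snippet assumes the header's own dependencies, Boost uBLAS and OpenCL, are on the include path):

    #include <cstddef>
    #include <boost/numeric/ublas/matrix.hpp>
    #include <boost/numeric/ublas/vector.hpp>
    #include "viennacl/linalg/detail/spai/qr.hpp"   // single_qr, apply_q_trans_vec

    namespace ublas = boost::numeric::ublas;

    void qr_least_squares_sketch()
    {
      ublas::matrix<double> A(4, 2);     // small over-determined system A*x ~= rhs
      ublas::vector<double> rhs(4);
      for (std::size_t i = 0; i < A.size1(); ++i)
      {
        A(i, 0) = 1.0;                               // toy data
        A(i, 1) = static_cast<double>(i);
        rhs(i)  = static_cast<double>(2 * i + 1);
      }

      ublas::vector<double> betas;                   // sized inside single_qr()
      viennacl::linalg::detail::spai::single_qr(A, betas);
      // A now holds R in its upper triangle and the Householder vectors below it.

      viennacl::linalg::detail::spai::apply_q_trans_vec(A, betas, rhs);  // rhs <- Q^T * rhs
      // Back-substituting the leading 2x2 upper-triangular block of A against the
      // first two entries of rhs yields the least-squares solution.
    }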
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
new file mode 100644
index 0000000..3cfdbb3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
@@ -0,0 +1,113 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/small_matrix.hpp
+    @brief Implementation of routines for small matrices (helpers for SPAI). Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+//
+// Constructs an orthonormal sparse matrix M (with M^T M = Id). It is composed of elementary 2x2 rotation matrices with a suitable renumbering.
+//
+template<typename MatrixT>
+void make_rotation_matrix(MatrixT & mat,
+                          vcl_size_t new_size,
+                          vcl_size_t off_diagonal_distance = 4)
+{
+  mat.resize(new_size, new_size, false);
+  mat.clear();
+
+  double val = 1.0 / std::sqrt(2.0);
+
+  for (vcl_size_t i=0; i<new_size; ++i)
+    mat(i,i) = val;
+
+  for (vcl_size_t i=off_diagonal_distance; i<new_size; ++i)
+  {
+    mat(i-off_diagonal_distance, i)                       = val;
+    mat(i,                       i-off_diagonal_distance) = -val;
+  }
+
+}
+
+
+//calculates the matrix determinant via LU factorization with partial pivoting
+template<typename MatrixT>
+double determinant(boost::numeric::ublas::matrix_expression<MatrixT> const & mat_r)
+{
+  double det = 1.0;
+
+  MatrixT mLu(mat_r());
+  boost::numeric::ublas::permutation_matrix<vcl_size_t> pivots(mat_r().size1());
+
+  int is_singular = static_cast<int>(lu_factorize(mLu, pivots));
+
+  if (!is_singular)
+  {
+    for (vcl_size_t i=0; i < pivots.size(); ++i)
+    {
+      if (pivots(i) != i)
+        det *= -1.0;
+
+      det *= mLu(i,i);
+    }
+  }
+  else
+    det = 0.0;
+
+  return det;
+}
+
+} //namespace spai
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+#endif

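The determinant() helper above works through an LU factorization with partial pivoting, flipping the sign once per row swap and multiplying up the diagonal of the factor. A tiny hedged check (toy 2x2 values; assumes the native-viennaCL include path and Boost uBLAS are available):

    #include <iostream>
    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/detail/spai/small_matrix.hpp"   // determinant()

    namespace ublas = boost::numeric::ublas;

    int main()
    {
      ublas::matrix<double> M(2, 2);
      M(0, 0) = 3.0;  M(0, 1) = 1.0;   // det = 3*2 - 1*4 = 2
      M(1, 0) = 4.0;  M(1, 1) = 2.0;
      std::cout << viennacl::linalg::detail::spai::determinant(M) << std::endl;   // prints 2
    }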
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
new file mode 100644
index 0000000..bac0b9e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
@@ -0,0 +1,687 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai-dynamic.hpp
+    @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+//#include "block_matrix.hpp"
+//#include "block_vector.hpp"
+//#include "benchmark-utils.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+#include "viennacl/linalg/detail/spai/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/spai.hpp"
+#include "viennacl/linalg/detail/spai/spai_tag.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief Helper functor for comparing std::pair<> based on the second member. */
+struct CompareSecond
+{
+  template<typename T1, typename T2>
+  bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
+  {
+    return static_cast<double>(left.second) > static_cast<double>(right.second);
+  }
+};
+
+
+/** @brief Composition of the new matrix R that is used for solving the least-squares problem
+ *
+ * @param A      matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
+ * @param R_n    matrix A_Iu_J_u after QR factorization
+ * @param R      previously composed matrix R
+ */
+template<typename MatrixT>
+void composeNewR(MatrixT const & A,
+                 MatrixT const & R_n,
+                 MatrixT & R)
+{
+  typedef typename MatrixT::value_type        NumericType;
+
+  vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
+  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2());
+
+  //write original R to new Composite R
+  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
+  //write the upper part of Q'*A_I_\hatJ: all columns, and as many rows as R.size2()
+  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),
+                                                                                                            R.size2() + A.size2())) +=
+  boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
+
+  //adding decomposed(QR) block to Composite R
+  if (R_n.size1() > 0 && R_n.size2() > 0)
+      boost::numeric::ublas::project(C,
+                                     boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
+                                     boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
+  R = C;
+}
+
+/** @brief Composition of the new vector of beta coefficients from the QR factorizations (necessary for Q recovery)
+ *
+ * @param v_n     new vector from last QR factorization
+ * @param v       composition of previous vectors from QR factorizations
+ */
+template<typename VectorT>
+void composeNewVector(VectorT const & v_n,
+                      VectorT       & v)
+{
+  typedef typename VectorT::value_type          NumericType;
+
+  VectorT w  = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size());
+  boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
+  boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
+  v = w;
+}
+
+/** @brief Computation of Euclidean norm for sparse vector
+ *
+ * @param v      initial sparse vector
+ * @param norm   scalar that represents Euclidean norm
+ */
+template<typename SparseVectorT, typename NumericT>
+void sparse_norm_2(SparseVectorT const & v,
+                   NumericT & norm)
+{
+  for (typename SparseVectorT::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it)
+    norm += (vec_it->second)*(vec_it->second);
+
+  norm = std::sqrt(norm);
+}
+
+/** @brief Dot product of two sparse vectors
+ *
+ * @param v1     initial sparse vector
+ * @param v2     initial sparse vector
+ * @param res_v  scalar that represents dot product result
+ */
+template<typename SparseVectorT, typename NumericT>
+void sparse_inner_prod(SparseVectorT const & v1,
+                       SparseVectorT const & v2,
+                       NumericT & res_v)
+{
+  typename SparseVectorT::const_iterator v_it1 = v1.begin();
+  typename SparseVectorT::const_iterator v_it2 = v2.begin();
+
+  while ((v_it1 != v1.end())&&(v_it2 != v2.end()))
+  {
+    if (v_it1->first == v_it2->first)
+    {
+      res_v += (v_it1->second)*(v_it2->second);
+      ++v_it1;
+      ++v_it2;
+    }
+    else if (v_it1->first < v_it2->first)
+      ++v_it1;
+    else
+      ++v_it2;
+  }
+}
+
+/** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31
+ *
+ * @param A_v_c  vectorized column-wise initial matrix
+ * @param res    residual vector
+ * @param J      set of column indices
+ * @param J_u    set of new column indices
+ * @param tag    SPAI tag with parameters
+ */
+template<typename SparseVectorT, typename NumericT>
+bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c,
+                            SparseVectorT const & res,
+                            std::vector<unsigned int> & J,
+                            std::vector<unsigned int> & J_u,
+                            spai_tag const & tag)
+{
+  std::vector<std::pair<unsigned int, NumericT> > p;
+  vcl_size_t cur_size = 0;
+  NumericT inprod, norm2;
+
+  for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
+  {
+    if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
+    {
+      inprod = norm2 = 0;
+      sparse_inner_prod(res, A_v_c[res_it->first], inprod);
+      sparse_norm_2(A_v_c[res_it->first], norm2);
+      p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2)));
+    }
+  }
+
+  std::sort(p.begin(), p.end(), CompareSecond());
+  while ((cur_size < J.size()) && (p.size() > 0))
+  {
+    J_u.push_back(p[0].first);
+    p.erase(p.begin());
+    cur_size++;
+  }
+  p.clear();
+  return (cur_size > 0);
+}
+
+/** @brief Builds the set of new row indices I_n to be added to the current set of row indices, cf. Kallischko dissertation p.32
+ *
+ * @param A_v_c    vectorized column-wise initial matrix
+ * @param I        set of previous determined row indices
+ * @param J_n      set of new column indices
+ * @param I_n      set of new indices
+ */
+template<typename SparseVectorT>
+void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c,
+                    std::vector<unsigned int>  const & I,
+                    std::vector<unsigned int>  const & J_n,
+                    std::vector<unsigned int>        & I_n)
+{
+  for (vcl_size_t i = 0; i < J_n.size(); ++i)
+  {
+    for (typename SparseVectorT::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it)
+    {
+      if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
+        I_n.push_back(col_it->first);
+    }
+  }
+}
+
+/** @brief Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4.7
+ *
+ * @param A_I_J       previously composed block
+ * @param A_I_J_u     matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
+ * @param A_I_u_J_u   composition of the lower part of A(I, \\tilde J) and A(\\tilde I, \\tilde J); the new block for QR decomposition
+ */
+template<typename MatrixT>
+void QRBlockComposition(MatrixT const & A_I_J,
+                        MatrixT const & A_I_J_u,
+                        MatrixT       & A_I_u_J_u)
+{
+  typedef typename MatrixT::value_type     NumericType;
+
+  vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
+  vcl_size_t row_n2 = A_I_u_J_u.size1();
+  vcl_size_t row_n = row_n1 + row_n2;
+  vcl_size_t col_n = A_I_J_u.size2();
+
+  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n);
+  boost::numeric::ublas::project(C,
+                                 boost::numeric::ublas::range(0, row_n1),
+                                 boost::numeric::ublas::range(0, col_n))
+  += boost::numeric::ublas::project(A_I_J_u,
+                                    boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
+                                    boost::numeric::ublas::range(0, col_n));
+
+  boost::numeric::ublas::project(C,
+                                 boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
+                                 boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
+  A_I_u_J_u = C;
+}
+
+/** @brief CPU-based dynamic update for SPAI preconditioner
+ *
+ * @param A            initial sparse matrix
+ * @param A_v_c        vectorized column-wise initial matrix
+ * @param g_res        container of residuals for all columns
+ * @param g_is_update  container of indicators that show which blocks should be modified
+ * @param g_I          container of row index sets for all columns
+ * @param g_J          container of column index sets for all columns
+ * @param g_b_v        container of vectors of betas for Q recovery (cf. Golub, Van Loan, "Matrix Computations", 3rd edition, p. 211)
+ * @param g_A_I_J      container of block matrices from previous update
+ * @param tag          SPAI configuration tag
+ */
+template<typename SparseMatrixT,
+         typename SparseVectorT,
+         typename DenseMatrixT,
+         typename VectorT>
+void block_update(SparseMatrixT const & A,
+                  std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<SparseVectorT>       & g_res,
+                  std::vector<bool> & g_is_update,
+                  std::vector<std::vector<unsigned int> >& g_I,
+                  std::vector<std::vector<unsigned int> >& g_J,
+                  std::vector<VectorT>      & g_b_v,
+                  std::vector<DenseMatrixT> & g_A_I_J,
+                  spai_tag const & tag)
+{
+  typedef typename DenseMatrixT::value_type     NumericType;
+
+
+  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());   // set of new column indices
+  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());   // set of new row indices
+  std::vector<DenseMatrixT> g_A_I_J_u(g_J.size());             // matrix A(I, \tilde J), cf. Kallischko p.31-32
+  std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size());           // matrix A(\tilde I, \tilde J), cf. Kallischko
+  std::vector<VectorT>      g_b_v_u(g_J.size());               // new vector of beta coefficients from QR factorization
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+  {
+    if (g_is_update[static_cast<vcl_size_t>(i)])
+    {
+      if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
+      {
+        //initialize matrix A_I_\hatJ
+        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
+        //multiplication of Q'*A_I_\hatJ
+        apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
+        //building new rows index set \hatI
+        buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
+        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
+        //composition of block for new QR factorization
+        QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
+        //QR factorization
+        single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]);
+        //composition of new R and new vector b_v
+        composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]);
+        composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]);
+        //composition of new sets: I and J
+        g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
+        g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
+      }
+      else
+      {
+        g_is_update[static_cast<vcl_size_t>(i)] = false;
+      }
+    }
+  }
+}
+
+
+/**************************************************** GPU SPAI Update ****************************************************************/
+
+
+//performs Q'*A(I, \tilde J) on GPU
+/** @brief Performs multiplication Q'*A(I, \\tilde J) on GPU
+ *
+ * @param g_J_u          container of sets of new column indices
+ * @param g_I            container of row indices
+ * @param g_A_I_J_vcl    block matrix composed from previous blocks (blocks of R)
+ * @param g_bv_vcl       block of beta vectors
+ * @param g_A_I_J_u_vcl  block of matrices A(I, \\tilde J)
+ * @param g_is_update    indicators that show whether a certain block should be processed
+ * @param ctx            Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u,
+                            std::vector<std::vector<unsigned int> > const & g_I,
+                            block_matrix & g_A_I_J_vcl,
+                            block_vector & g_bv_vcl,
+                            block_matrix & g_A_I_J_u_vcl,
+                            std::vector<cl_uint> & g_is_update,
+                            viennacl::context ctx)
+{
+  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+  unsigned int local_r_n = 0;
+  unsigned int local_c_n = 0;
+  unsigned int sz_blocks = 0;
+
+  get_max_block_size(g_I,   local_r_n);
+  get_max_block_size(g_J_u, local_c_n);
+
+  //for debug
+  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+  //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
+
+  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                           &(g_is_update[0]));
+  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+  viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult");
+
+  block_q_kernel.local_work_size(0,      local_c_n);
+  block_q_kernel.global_work_size(0, 128*local_c_n);
+  viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+                                        g_bv_vcl.handle(),
+                                        g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
+                                        viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
+                                        static_cast<cl_uint>(g_I.size())));
+}
+
+/** @brief Assembly of container of index row sets: I_q, row indices for new "QR block"
+ *
+ * @param g_I    container of row indices
+ * @param g_J    container of column indices
+ * @param g_I_u  container of new row indices
+ * @param g_I_q  container of row indices for new QR blocks
+ */
+template<typename SizeT>
+void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I,
+                          std::vector<std::vector<SizeT> > const & g_J,
+                          std::vector<std::vector<SizeT> > const & g_I_u,
+                          std::vector<std::vector<SizeT> >       & g_I_q)
+{
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i = 0; i < static_cast<long>(g_I.size()); ++i)
+  {
+    for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j)
+      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]);
+
+    for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j)
+      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]);
+  }
+}
+
+/** @brief Performs assembly for new QR block
+ *
+ * @param g_J                container of column indices
+ * @param g_I                container of row indices
+ * @param g_J_u              container of new column indices
+ * @param g_I_u              container of new row indices
+ * @param g_I_q              container of row indices for new QR blocks
+ * @param g_A_I_J_u_vcl      blocks of Q'*A(I, \\tilde J)
+ * @param matrix_dimensions  array with matrix dimensions for all blocks
+ * @param g_A_I_u_J_u_vcl    blocks A(\\tilde I, \\tilde J)
+ * @param g_is_update        container with update indicators
+ * @param is_empty_block     indicator whether all previous blocks A(\\tilde I, \\tilde J) are empty; if they are, a kernel with a smaller number of arguments is used
+ * @param ctx                Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+*/
+template<typename NumericT>
+void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J,
+                       std::vector<std::vector<unsigned int> > const& g_I,
+                       std::vector<std::vector<unsigned int> > const& g_J_u,
+                       std::vector<std::vector<unsigned int> > const& g_I_u,
+                       std::vector<std::vector<unsigned int> >& g_I_q,
+                       block_matrix & g_A_I_J_u_vcl,
+                       viennacl::ocl::handle<cl_mem> & matrix_dimensions,
+                       block_matrix & g_A_I_u_J_u_vcl,
+                       std::vector<cl_uint> & g_is_update,
+                       bool is_empty_block,
+                       viennacl::context ctx)
+{
+  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+  //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
+  assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
+  unsigned int sz_blocks;
+  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+
+  compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+
+  std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0));
+
+  block_matrix g_A_I_J_q_vcl;
+  //need to allocate memory for QR block
+  g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
+                                                    &(con_A_I_J_q[0]));
+  g_A_I_J_q_vcl.handle().context(opencl_ctx);
+
+  g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+                                                     &(matrix_dims[0]));
+  g_A_I_J_q_vcl.handle1().context(opencl_ctx);
+
+  g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                      static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
+                                                      &(blocks_ind[0]));
+  g_A_I_J_q_vcl.handle2().context(opencl_ctx);
+
+  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                           &(g_is_update[0]));
+
+  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+  if (!is_empty_block)
+  {
+    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly");
+    qr_assembly_kernel.local_work_size(0, 1);
+    qr_assembly_kernel.global_work_size(0, 256);
+    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
+                                              g_A_I_J_u_vcl.handle(),
+                                              g_A_I_J_u_vcl.handle2(),
+                                              g_A_I_J_u_vcl.handle1(),
+                                              g_A_I_u_J_u_vcl.handle(),
+                                              g_A_I_u_J_u_vcl.handle2(),
+                                              g_A_I_u_J_u_vcl.handle1(),
+                                              g_A_I_J_q_vcl.handle(),
+                                              g_A_I_J_q_vcl.handle2(),
+                                              g_A_I_J_q_vcl.handle1(),
+                                              g_is_update_vcl,
+                                              static_cast<unsigned int>(g_I.size())));
+  }
+  else
+  {
+    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1");
+    qr_assembly_kernel.local_work_size(0, 1);
+    qr_assembly_kernel.global_work_size(0, 256);
+    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+                                              g_A_I_J_u_vcl.handle1(),
+                                              g_A_I_J_q_vcl.handle(),
+                                              g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
+                                              g_is_update_vcl,
+                                              static_cast<unsigned int>(g_I.size())));
+  }
+  g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
+  g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
+  g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
+}
+
+/** @brief Performs assembly for new R matrix on GPU
+ *
+ * @param g_I              container of row indices
+ * @param g_J              container of column indices
+ * @param g_A_I_J_vcl      container of block matrices from previous update
+ * @param g_A_I_J_u_vcl    container of block matrices Q'*A(I, \\tilde J)
+ * @param g_A_I_u_J_u_vcl  container of block matrices QR factored on current iteration
+ * @param g_bv_vcl         block of beta vectors from previous iteration
+ * @param g_bv_vcl_u       block of updated beta vectors obtained from the most recent QR factorization
+ * @param g_is_update      container of indicators that show which blocks should be modified
+ * @param ctx              Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void assemble_r(std::vector<std::vector<unsigned int> > & g_I,
+                std::vector<std::vector<unsigned int> > & g_J,
+                block_matrix & g_A_I_J_vcl,
+                block_matrix & g_A_I_J_u_vcl,
+                block_matrix & g_A_I_u_J_u_vcl,
+                block_vector & g_bv_vcl,
+                block_vector & g_bv_vcl_u,
+                std::vector<cl_uint> & g_is_update,
+                viennacl::context ctx)
+{
+  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
+  unsigned int sz_blocks, bv_size;
+
+  compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+  get_size(g_J, bv_size);
+  init_start_inds(g_J, start_bv_r_inds);
+
+  std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0));
+  std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0));
+
+  block_matrix g_A_I_J_r_vcl;
+  block_vector g_bv_r_vcl;
+  g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
+                                                    &(con_A_I_J_r[0]));
+  g_A_I_J_r_vcl.handle().context(opencl_ctx);
+
+  g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+                                                     &(matrix_dims[0]));
+  g_A_I_J_r_vcl.handle1().context(opencl_ctx);
+
+  g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
+                                                     &(blocks_ind[0]));
+  g_A_I_J_r_vcl.handle2().context(opencl_ctx);
+
+  g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                 static_cast<unsigned int>(sizeof(NumericT)*bv_size),
+                                                 &(b_v_r[0]));
+  g_bv_r_vcl.handle().context(opencl_ctx);
+
+  g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                  static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                  &(start_bv_r_inds[0]));
+  g_bv_r_vcl.handle().context(opencl_ctx);
+
+  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                           &(g_is_update[0]));
+  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+  viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly");
+  r_assembly_kernel.local_work_size(0, 1);
+  r_assembly_kernel.global_work_size(0, 256);
+
+  viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
+                                          g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
+                                          g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
+                                          g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
+                                          g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
+
+  viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly");
+  bv_assembly_kernel.local_work_size(0, 1);
+  bv_assembly_kernel.global_work_size(0, 256);
+  viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
+                                            g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
+                                            g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
+                                            static_cast<cl_uint>(g_I.size())));
+  g_bv_vcl.handle() = g_bv_r_vcl.handle();
+  g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
+
+  g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
+  g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
+  g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
+}
+
+/** @brief GPU-based block update
+ *
+ * @param A            sparse matrix
+ * @param A_v_c        vectorized column-wise initial matrix
+ * @param g_is_update  container of indicators that show which blocks should be modified
+ * @param g_res        container of residuals for all columns
+ * @param g_J          container of column index sets for all columns
+ * @param g_I          container of row index sets for all columns
+ * @param g_A_I_J_vcl  container of block matrices from previous update
+ * @param g_bv_vcl     block of beta vectors from previous iteration
+ * @param tag          SPAI configuration tag
+ */
+template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
+void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+                  std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<cl_uint> & g_is_update,
+                  std::vector<SparseVectorT> & g_res,
+                  std::vector<std::vector<unsigned int> > & g_J,
+                  std::vector<std::vector<unsigned int> > & g_I,
+                  block_matrix & g_A_I_J_vcl,
+                  block_vector & g_bv_vcl,
+                  spai_tag const & tag)
+{
+  viennacl::context ctx = viennacl::traits::context(A);
+  //updated index set for columns
+  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
+  //updated index set for rows
+  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
+  //mixed index set of old and updated indices for rows
+  std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
+  //GPU memory for A_I_\hatJ
+  block_matrix g_A_I_J_u_vcl;
+  //GPU memory for A_\hatI_\hatJ
+  block_matrix g_A_I_u_J_u_vcl;
+  bool is_empty_block;
+  //GPU memory for new b_v
+  block_vector g_bv_u_vcl;
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+  {
+    if (g_is_update[static_cast<vcl_size_t>(i)])
+    {
+      if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
+          buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
+    }
+  }
+  //assemble new A_I_J_u blocks on GPU and multiply them with Q'
+  block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
+  //I have matrix A_I_J_u ready..
+  block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
+  //assemble A_\hatI_\hatJ
+  block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
+  assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
+                              g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);
+
+  block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
+  //concatenation of new and old indices
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+  {
+    g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
+    g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
+  }
+  assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, ctx);
+}
+
+} //namespace spai
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+#endif

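The pattern growth in buildAugmentedIndexSet() above ranks each candidate column j of A by the score (r^T a_j)^2 / ||a_j||^2 and keeps the best candidates (cf. the Kallischko reference in the comments). A standalone sketch of that scoring step, using std::map as a stand-in for the sparse-vector type (the helper candidate_score and the map representation are illustrative assumptions, not the exact types used by the preconditioner):

    #include <iostream>
    #include <map>

    typedef std::map<unsigned int, double> sparse_vec;   // index -> value

    // score(j) = (r . a_j)^2 / ||a_j||^2, mirroring sparse_inner_prod()/sparse_norm_2() above
    double candidate_score(sparse_vec const & r, sparse_vec const & a_j)
    {
      double inprod = 0.0, norm2 = 0.0;
      sparse_vec::const_iterator it1 = r.begin(), it2 = a_j.begin();
      while (it1 != r.end() && it2 != a_j.end())          // merge-style sparse dot product
      {
        if      (it1->first == it2->first) { inprod += it1->second * it2->second; ++it1; ++it2; }
        else if (it1->first <  it2->first) ++it1;
        else                               ++it2;
      }
      for (sparse_vec::const_iterator it = a_j.begin(); it != a_j.end(); ++it)
        norm2 += it->second * it->second;                 // squared Euclidean norm of a_j
      return (inprod * inprod) / norm2;
    }

    int main()
    {
      sparse_vec r, a;
      r[0] = 1.0;  r[2] = -2.0;
      a[0] = 0.5;  a[1] = 3.0;
      std::cout << candidate_score(r, a) << std::endl;    // 0.25 / 9.25, roughly 0.027
    }

Candidates whose residual entry is below tag.getResidualThreshold(), or that are already contained in J, are skipped before scoring, as in the loop above.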
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
new file mode 100644
index 0000000..0fd11146
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
@@ -0,0 +1,192 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai-static.hpp
+    @brief Implementation of a static SPAI. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+//#include "spai-dynamic.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+
+//#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief Determines if element ind is in set {J}
+ *
+ * @param J     current set
+ * @param ind   current element
+ */
+template<typename SizeT>
+bool isInIndexSet(std::vector<SizeT> const & J, SizeT ind)
+{
+  return (std::find(J.begin(), J.end(), ind) != J.end());
+}
+
+
+
+/********************************* STATIC SPAI FUNCTIONS******************************************/
+
+/** @brief Projects the solution of the least-squares (LS) problem back onto the original column m
+ *
+ * @param m_in   solution of the LS problem
+ * @param J      set of non-zero columns
+ * @param m      original column of M
+ */
+template<typename VectorT, typename SparseVectorT>
+void fanOutVector(VectorT const & m_in, std::vector<unsigned int> const & J, SparseVectorT & m)
+{
+  unsigned int  cnt = 0;
+  for (vcl_size_t i = 0; i < J.size(); ++i)
+    m[J[i]] = m_in(cnt++);
+}
+
+/** @brief Solves the linear system R*x = y by backward substitution
+ *
+ * @param R   upper triangular matrix
+ * @param y   right-hand side vector
+ * @param x   solution vector
+ */
+template<typename MatrixT, typename VectorT>
+void backwardSolve(MatrixT const & R, VectorT const & y, VectorT & x)
+{
+  for (long i2 = static_cast<long>(R.size2())-1; i2 >= 0; i2--)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    x(i) = y(i);
+    for (vcl_size_t j = static_cast<vcl_size_t>(i)+1; j < R.size2(); ++j)
+      x(i) -= R(i,j)*x(j);
+
+    x(i) /= R(i,i);
+  }
+}
+
+/** @brief Restricts the unit vector with index ind to the row index set I
+ *
+ * @param I     set of non-zero rows
+ * @param y     result vector (1 where I[i] == ind, 0 otherwise)
+ * @param ind   index of the unit vector
+ */
+template<typename VectorT, typename NumericT>
+void projectI(std::vector<unsigned int> const & I, VectorT & y, unsigned int ind)
+{
+  for (vcl_size_t i = 0; i < I.size(); ++i)
+  {
+    //y.resize(y.size()+1);
+    if (I[i] == ind)
+      y(i) = NumericT(1.0);
+    else
+      y(i) = NumericT(0.0);
+  }
+}
+
+/** @brief Builds index set of projected columns for current column of preconditioner
+ *
+ * @param v    current column of preconditioner
+ * @param J    output - index set of non-zero columns
+ */
+template<typename SparseVectorT>
+void buildColumnIndexSet(SparseVectorT const & v, std::vector<unsigned int> & J)
+{
+  for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
+    J.push_back(vec_it->first);
+
+  std::sort(J.begin(), J.end());
+}
+
+/** @brief Initializes the preconditioner with the sparsity pattern p(A)
+ *
+ * @param A   input matrix
+ * @param M   output matrix - initialized preconditioner
+ */
+template<typename SparseMatrixT>
+void initPreconditioner(SparseMatrixT const & A, SparseMatrixT & M)
+{
+  typedef typename SparseMatrixT::value_type      NumericType;
+
+  M.resize(A.size1(), A.size2(), false);
+  for (typename SparseMatrixT::const_iterator1 row_it = A.begin1(); row_it!= A.end1(); ++row_it)
+    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      M(col_it.index1(),col_it.index2()) = NumericType(1);
+}
+
+/** @brief Row projection for matrix A(:,J) -> A(I,J): builds the index set of non-zero rows
+ *
+ * @param A_v_c   input matrix, stored column-wise as sparse vectors
+ * @param J       set of non-zero columns
+ * @param I       output index set of non-zero rows
+ */
+template<typename SparseVectorT>
+void projectRows(std::vector<SparseVectorT> const & A_v_c,
+                 std::vector<unsigned int> const & J,
+                 std::vector<unsigned int>       & I)
+{
+  for (vcl_size_t i = 0; i < J.size(); ++i)
+  {
+    for (typename SparseVectorT::const_iterator col_it = A_v_c[J[i]].begin(); col_it!=A_v_c[J[i]].end(); ++col_it)
+    {
+      if (!isInIndexSet(I, col_it->first))
+        I.push_back(col_it->first);
+    }
+  }
+  std::sort(I.begin(), I.end());
+}
+
+
+} //namespace spai
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+#endif
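
The helpers above are header-only and can be exercised in isolation. A small worked example for backwardSolve (assuming ViennaCL and Boost.uBLAS are on the include path), solving the upper triangular system R*x = y with known solution x = (1, 2):

    #include <iostream>
    #include "boost/numeric/ublas/matrix.hpp"
    #include "boost/numeric/ublas/vector.hpp"
    #include "viennacl/linalg/detail/spai/spai-static.hpp"

    int main()
    {
      namespace ublas = boost::numeric::ublas;

      // Upper triangular system R*x = y with known solution x = (1, 2).
      ublas::matrix<double> R(2, 2);
      R(0,0) = 2.0; R(0,1) = 1.0;
      R(1,0) = 0.0; R(1,1) = 3.0;

      ublas::vector<double> y(2), x(2);
      y(0) = 4.0;  // 2*1 + 1*2
      y(1) = 6.0;  // 3*2

      viennacl::linalg::detail::spai::backwardSolve(R, y, x);
      std::cout << x(0) << " " << x(1) << std::endl;  // expected output: 1 2
      return 0;
    }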


[16/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
new file mode 100644
index 0000000..540ff82
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
@@ -0,0 +1,33 @@
+#ifndef VIENNACL_LINALG_ILU_HPP_
+#define VIENNACL_LINALG_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu.hpp
+    @brief Implementations of incomplete factorization preconditioners. Convenience header file.
+*/
+
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
+#include "viennacl/linalg/detail/ilu/chow_patel_ilu.hpp"
+
+#endif
+
+
+
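
This convenience header is what user code typically includes; the preconditioner classes it pulls in are then handed to the iterative solvers. A minimal sketch of the usual pattern (ILU0 with BiCGStab; the matrix A and right-hand side b are assumed to be populated already):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"
    #include "viennacl/linalg/bicgstab.hpp"

    // Solve A*x = b with BiCGStab, preconditioned by ILU0.
    viennacl::vector<double> solve_with_ilu0(viennacl::compressed_matrix<double> const & A,
                                             viennacl::vector<double> const & b)
    {
      // The ILU0 factorization is computed in the preconditioner's constructor.
      viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> >
          ilu0(A, viennacl::linalg::ilu0_tag());

      // Hand the preconditioner to the iterative solver.
      return viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(), ilu0);
    }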

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
new file mode 100644
index 0000000..febd347
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
@@ -0,0 +1,334 @@
+#ifndef VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/ilu_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/ilu_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/ilu_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Extracts the lower triangular part L from A.
+  *
+  * Diagonal of L is stored explicitly in order to enable better code reuse.
+  *
+  */
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::extract_L(A, L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::extract_L(A, L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::extract_L(A, L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly.
+  *
+  * Since A should not be modified (const-correctness), updates are in L.
+  *
+  */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::icc_scale(A, L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::icc_scale(A, L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::icc_scale(A, L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC (cf. Algorithm 3 in paper, but for L rather than U)
+  *
+  * We use a fully synchronous (Jacobi-like) variant, because the asynchronous methods described in the paper are a nightmare to debug
+  * (in particular when they fail only intermittently).
+  *
+  * @param L       Factor L to be updated for the incomplete Cholesky factorization
+  * @param aij_L   Lower triangular portion of the system matrix
+  */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>                  & aij_L)
+{
+  switch (viennacl::traits::handle(L).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::icc_chow_patel_sweep(L, aij_L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::icc_chow_patel_sweep(L, aij_L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+//////////////////////// ILU ////////////////////
+
+/** @brief Extracts the lower triangular part L and the upper triangular part U from A.
+  *
+  * Diagonals of L and U are stored explicitly in order to enable better code reuse.
+  *
+  */
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::extract_LU(A, L, U);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::extract_LU(A, L, U);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::extract_LU(A, L, U);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly.
+  *
+  * Since A should not be modified (const-correctness), updates are in L and U.
+  *
+  */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_scale(A, L, U);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_scale(A, L, U);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_scale(A, L, U);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Transposition B <- A^T, where the aij-vector is permuted in the same way as the value array in A when assigned to B
+  *
+  * @param A     Input matrix to be transposed
+  * @param B     Output matrix containing the transposed matrix
+  */
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+                   compressed_matrix<NumericT>       & B)
+{
+  viennacl::context orig_ctx = viennacl::traits::context(A);
+  viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+  (void)orig_ctx;
+  (void)cpu_ctx;
+
+  viennacl::compressed_matrix<NumericT> A_host(0, 0, 0, cpu_ctx);
+  (void)A_host;
+
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_transpose(A, B);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    A_host = A;
+    B.switch_memory_context(cpu_ctx);
+    viennacl::linalg::host_based::ilu_transpose(A_host, B);
+    B.switch_memory_context(orig_ctx);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    A_host = A;
+    B.switch_memory_context(cpu_ctx);
+    viennacl::linalg::host_based::ilu_transpose(A_host, B);
+    B.switch_memory_context(orig_ctx);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU (cf. Algorithm 2 in paper)
+  *
+  * We use a fully synchronous (Jacobi-like) variant, because the asynchronous methods described in the paper are a nightmare to debug
+  * (in particular when they fail only intermittently).
+  *
+  * @param L            Lower-triangular factor L of the LU factorization
+  * @param aij_L        Lower-triangular part of A (values matching the nonzero pattern of L)
+  * @param U_trans      Upper-triangular factor U in CSC storage, which is the same as U^T in CSR storage
+  * @param aij_U_trans  Upper-triangular part of A in CSC storage, matching the pattern of U^T in CSR storage
+  */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  switch (viennacl::traits::handle(L).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Prepares R for approximating the triangular solves by a truncated Neumann series.
+  *
+  * The diagonal of R is extracted into diag_R; the remaining entries of R are transformed accordingly.
+  *
+  */
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  switch (viennacl::traits::handle(R).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
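
The dispatch routines above are the building blocks of the Chow-Patel ILU setup in viennacl/linalg/detail/ilu/chow_patel_ilu.hpp: extract and scale the factors once, transpose U into CSC form, then run a fixed number of synchronous sweeps. A rough sketch of that setup phase follows; copy_nonzero_values() is a hypothetical placeholder for copying a factor's value array into a plain vector (the actual implementation copies the underlying buffers directly):

    #include <cstddef>
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/traits/context.hpp"
    #include "viennacl/linalg/ilu_operations.hpp"

    // Hypothetical helper (not part of ViennaCL): copies the nonzero values of M into v.
    template<typename NumericT>
    void copy_nonzero_values(viennacl::compressed_matrix<NumericT> const & M,
                             viennacl::vector<NumericT> & v);

    template<typename NumericT>
    void chow_patel_ilu_setup_sketch(viennacl::compressed_matrix<NumericT> const & A,
                                     viennacl::compressed_matrix<NumericT>       & L,
                                     viennacl::compressed_matrix<NumericT>       & U_trans,
                                     std::size_t num_sweeps)
    {
      viennacl::compressed_matrix<NumericT> U;

      viennacl::linalg::extract_LU(A, L, U);        // split A into L and U (diagonals stored explicitly)
      viennacl::linalg::ilu_scale(A, L, U);         // rescale entries so that DAD has unit diagonal
      viennacl::linalg::ilu_transpose(U, U_trans);  // U in CSC storage == U^T in CSR storage

      // Initial values for the sweeps are the (scaled) entries of A themselves:
      viennacl::vector<NumericT> aij_L(L.nnz(), viennacl::traits::context(A));
      viennacl::vector<NumericT> aij_U_trans(U_trans.nnz(), viennacl::traits::context(A));
      copy_nonzero_values(L, aij_L);
      copy_nonzero_values(U_trans, aij_U_trans);

      // Fixed number of synchronous (Jacobi-like) nonlinear relaxation sweeps:
      for (std::size_t i = 0; i < num_sweeps; ++i)
        viennacl::linalg::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
    }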

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
new file mode 100644
index 0000000..b31a82a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
@@ -0,0 +1,186 @@
+#ifndef VIENNACL_LINALG_INNER_PROD_HPP_
+#define VIENNACL_LINALG_INNER_PROD_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/inner_prod.hpp
+    @brief Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+//
+// generic inner_prod function
+//   uses tag dispatch to identify which algorithm
+//   should be called
+//
+namespace linalg
+{
+
+#ifdef VIENNACL_WITH_ARMADILLO
+// ----------------------------------------------------
+// Armadillo
+//
+template<typename NumericT>
+NumericT inner_prod(arma::Col<NumericT> const& v1, arma::Col<NumericT> const& v2)
+{
+  return dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+// ----------------------------------------------------
+// EIGEN
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::RealScalar>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "eigen .. " << std::endl;
+  return v1.dot(v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+// ----------------------------------------------------
+// MTL4
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "mtl4 .. " << std::endl;
+  return mtl::dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_UBLAS
+// ----------------------------------------------------
+// UBLAS
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "ublas .. " << std::endl;
+  return boost::numeric::ublas::inner_prod(v1, v2);
+}
+#endif
+
+// ----------------------------------------------------
+// STL
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  assert(v1.size() == v2.size() && bool("Vector sizes mismatch"));
+  //std::cout << "stl .. " << std::endl;
+  typename VectorT1::value_type result = 0;
+  for (typename VectorT1::size_type i=0; i<v1.size(); ++i)
+    result += v1[i] * v2[i];
+
+  return result;
+}
+
+// ----------------------------------------------------
+// VIENNACL
+//
+template<typename NumericT>
+viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+           vector_base<NumericT> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const vector_base<NumericT>,
+                                      const vector_base<NumericT>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// expression on lhs:
+template< typename LHS, typename RHS, typename OP, typename NumericT>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                             const vector_base<NumericT>,
+                             viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1,
+           vector_base<NumericT> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                      const vector_base<NumericT>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on rhs:
+template<typename NumericT, typename LHS, typename RHS, typename OP>
+viennacl::scalar_expression< const vector_base<NumericT>,
+                             const viennacl::vector_expression<LHS, RHS, OP>,
+                             viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+           viennacl::vector_expression<LHS, RHS, OP> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const vector_base<NumericT>,
+                                      const viennacl::vector_expression<LHS, RHS, OP>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on lhs and rhs:
+template<typename LHS1, typename RHS1, typename OP1,
+         typename LHS2, typename RHS2, typename OP2>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                             const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                             viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1,
+           viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                      const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// Multiple inner products:
+template<typename NumericT>
+viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & x,
+           vector_tuple<NumericT> const & y_tuple)
+{
+  return viennacl::vector_expression< const vector_base<NumericT>,
+                                      const vector_tuple<NumericT>,
+                                      viennacl::op_inner_prod >(x, y_tuple);
+}
+
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
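
A short usage sketch for the overloads above: the STL overload returns the dot product directly, while the ViennaCL overload returns a scalar_expression that is evaluated on assignment to a host scalar (or a viennacl::scalar):

    #include <iostream>
    #include <vector>
    #include "viennacl/scalar.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"

    int main()
    {
      std::vector<double> x_host(3), y_host(3);
      x_host[0] = 1.0; x_host[1] = 2.0; x_host[2] = 3.0;
      y_host[0] = 4.0; y_host[1] = 5.0; y_host[2] = 6.0;

      // STL overload: plain loop on the host.
      double dot_host = viennacl::linalg::inner_prod(x_host, y_host);   // 32

      // ViennaCL overload: returns an expression, evaluated when assigned.
      viennacl::vector<double> x(3), y(3);
      viennacl::copy(x_host, x);
      viennacl::copy(y_host, y);
      double dot_device = viennacl::linalg::inner_prod(x, y);

      std::cout << dot_host << " " << dot_device << std::endl;          // 32 32
      return 0;
    }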

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
new file mode 100644
index 0000000..78a813d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
@@ -0,0 +1,425 @@
+#ifndef VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/iterative_operations.hpp
+    @brief Implementations of specialized routines for the iterative solvers.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/iterative_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/iterative_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/iterative_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for the vectors 'result', 'p', 'r', 'Ap':
+  *   result += alpha * p;
+  *   r      -= alpha * Ap;
+  *   p       = r + beta * p;
+  * and runs the parallel reduction stage for computing inner_prod(r,r)
+  */
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+                                NumericT alpha,
+                                vector_base<NumericT> & p,
+                                vector_base<NumericT> & r,
+                                vector_base<NumericT> const & Ap,
+                                NumericT beta,
+                                vector_base<NumericT> & inner_prod_buffer)
+{
+  switch (viennacl::traits::handle(result).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Performs the matrix-vector product required for an efficient pipelined CG algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename MatrixT, typename NumericT>
+void pipelined_cg_prod(MatrixT const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+  *
+  * This routine computes for the vectors 's', 'r', 'Ap':
+  *   s = r - alpha * Ap
+  * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
+  * and runs the parallel reduction stage for computing inner_prod(s,s)
+  */
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+                                 vector_base<NumericT> & r,
+                                 vector_base<NumericT> const & Ap,
+                                 vector_base<NumericT> & inner_prod_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  switch (viennacl::traits::handle(s).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+  *
+  * x_{j+1} = x_j + alpha * p_j + omega * s_j
+  * r_{j+1} = s_j - omega * t_j
+  * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+  * and computes the first stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in the next iteration
+  */
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+                                      NumericT beta, vector_base<NumericT> const & Ap,
+                                      vector_base<NumericT> const & r0star,
+                                      vector_base<NumericT> & inner_prod_buffer,
+                                      vcl_size_t buffer_chunk_size)
+{
+  switch (viennacl::traits::handle(s).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Performs the matrix-vector product required for an efficient pipelined BiCGStab algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template<typename MatrixT, typename NumericT>
+void pipelined_bicgstab_prod(MatrixT const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+  *
+  * This routine computes for the vectors 'r', 'v_k':
+  *   Second reduction step for ||v_k||
+  *   v_k /= ||v_k||
+  *   First reduction step for <r, v_k>
+  */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+                                  vector_base<T> const & residual,
+                                  vector_base<T> & R_buffer,
+                                  vcl_size_t offset_in_R,
+                                  vector_base<T> const & inner_prod_buffer,
+                                  vector_base<T> & r_dot_vk_buffer,
+                                  vcl_size_t buffer_chunk_size,
+                                  vcl_size_t buffer_chunk_offset)
+{
+  switch (viennacl::traits::handle(v_k).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1
+  *
+  *  All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+  */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t k,
+                                         vector_base<T> & vi_in_vk_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||
+  *
+  *  All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+  */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t k,
+                                         vector_base<T> const & vi_in_vk_buffer,
+                                         vector_base<T> & R_buffer,
+                                         vcl_size_t krylov_dim,
+                                         vector_base<T> & inner_prod_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+                                   vector_base<T> const & residual,
+                                   vector_base<T> const & krylov_basis,
+                                   vcl_size_t v_k_size,
+                                   vcl_size_t v_k_internal_size,
+                                   vector_base<T> const & coefficients,
+                                   vcl_size_t k)
+{
+  switch (viennacl::traits::handle(result).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs the matrix-vector product required for an efficient pipelined GMRES algorithm.
+  *
+  * This routine computes for a matrix A and vectors 'p' and 'Ap':
+  *   Ap = prod(A, p);
+  * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+  */
+template <typename MatrixType, typename T>
+void pipelined_gmres_prod(MatrixType const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
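
For orientation, the fused CG kernels above implement the textbook CG recurrences; the pipelined formulation only merges the vector updates with the reduction stages to save global synchronizations. A plain host-side reference of one CG iteration (no ViennaCL types; A_times stands for any callable computing Ap = A*p) that the routines correspond to:

    #include <vector>
    #include <cstddef>

    // One textbook CG iteration on host data. rr_old is <r,r> from the previous iteration.
    // pipelined_cg_prod fuses the product with the reductions for <p,Ap>;
    // pipelined_cg_vector_update fuses the three vector updates with the reduction for <r,r>.
    template<typename MatVecT>
    double cg_iteration(MatVecT const & A_times,
                        std::vector<double> & x, std::vector<double> & r, std::vector<double> & p,
                        double rr_old)
    {
      std::vector<double> Ap(p.size());
      A_times(p, Ap);                                   // Ap = A * p

      double pAp = 0;
      for (std::size_t i = 0; i < p.size(); ++i)
        pAp += p[i] * Ap[i];

      double alpha = rr_old / pAp;

      double rr_new = 0;
      for (std::size_t i = 0; i < x.size(); ++i)
      {
        x[i] += alpha * p[i];
        r[i] -= alpha * Ap[i];
        rr_new += r[i] * r[i];
      }

      double beta = rr_new / rr_old;
      for (std::size_t i = 0; i < p.size(); ++i)
        p[i] = r[i] + beta * p[i];

      return rr_new;   // becomes rr_old in the next iteration
    }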

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
new file mode 100644
index 0000000..0b16964
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+#define VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/jacobi_precond.hpp
+    @brief Implementation of a simple Jacobi preconditioner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the Jacobi preconditioner
+*/
+class jacobi_tag {};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines. Generic version for non-ViennaCL matrices.
+*/
+template<typename MatrixT,
+          bool is_viennacl = detail::row_scaling_for_viennacl<MatrixT>::value >
+class jacobi_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+  public:
+    jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(viennacl::traits::size1(mat))
+    {
+      init(mat);
+    }
+
+    void init(MatrixT const & mat)
+    {
+      diag_A_.resize(viennacl::traits::size1(mat));  //resize without preserving values
+
+      for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+            row_it != mat.end1();
+            ++row_it)
+      {
+        bool diag_found = false;
+        for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+              col_it != row_it.end();
+              ++col_it)
+        {
+          if (col_it.index1() == col_it.index2())
+          {
+            diag_A_[col_it.index1()] = *col_it;
+            diag_found = true;
+          }
+        }
+        if (!diag_found)
+          throw zero_on_diagonal_exception("ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!");
+      }
+    }
+
+
+    /** @brief Applies the preconditioner to vec (typically the residual r = b - A*x), i.e. divides each entry by the corresponding diagonal entry of A. */
+    template<typename VectorT>
+    void apply(VectorT & vec) const
+    {
+      assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+      for (vcl_size_t i=0; i<diag_A_.size(); ++i)
+        vec[i] /= diag_A_[i];
+    }
+
+  private:
+    std::vector<NumericType> diag_A_;
+};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename MatrixT>
+class jacobi_precond<MatrixT, true>
+{
+    typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type  NumericType;
+
+  public:
+    jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(mat.size1(), viennacl::traits::context(mat))
+    {
+      init(mat);
+    }
+
+
+    void init(MatrixT const & mat)
+    {
+      detail::row_info(mat, diag_A_, detail::SPARSE_ROW_DIAGONAL);
+    }
+
+
+    template<unsigned int AlignmentV>
+    void apply(viennacl::vector<NumericType, AlignmentV> & vec) const
+    {
+      assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+      vec = element_div(vec, diag_A_);
+    }
+
+  private:
+    viennacl::vector<NumericType> diag_A_;
+};
+
+}
+}
+
+
+
+
+#endif
+
+
+
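
As with the other preconditioners, the Jacobi preconditioner is handed to the solve() routines. A minimal sketch (A and b assumed to be populated; cg.hpp provides cg_tag and solve):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/jacobi_precond.hpp"

    // Solve A*x = b with CG, preconditioned by the diagonal (Jacobi) preconditioner above.
    viennacl::vector<double> solve_with_jacobi(viennacl::compressed_matrix<double> const & A,
                                               viennacl::vector<double> const & b)
    {
      viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<double> >
          jacobi(A, viennacl::linalg::jacobi_tag());

      return viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(), jacobi);
    }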

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
new file mode 100644
index 0000000..ffac471
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
@@ -0,0 +1,515 @@
+#ifndef VIENNACL_LINALG_LANCZOS_HPP_
+#define VIENNACL_LINALG_LANCZOS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lanczos.hpp
+*   @brief Generic interface for the Lanczos algorithm.
+*
+*   Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/tools/random.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the Lanczos algorithm.
+*/
+class lanczos_tag
+{
+public:
+
+  enum
+  {
+    partial_reorthogonalization = 0,
+    full_reorthogonalization,
+    no_reorthogonalization
+  };
+
+  /** @brief The constructor
+  *
+  * @param factor                 Exponent of epsilon - tolerance for batches of reorthogonalization
+  * @param numeig                 Number of eigenvalues to be returned
+  * @param met                    Method for the Lanczos algorithm: 0 for partial reorthogonalization, 1 for full reorthogonalization, and 2 for Lanczos without reorthogonalization
+  * @param krylov                 Maximum size of the Krylov space
+  */
+
+  lanczos_tag(double factor = 0.75,
+              vcl_size_t numeig = 10,
+              int met = 0,
+              vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
+
+  /** @brief Sets the number of eigenvalues */
+  void num_eigenvalues(vcl_size_t numeig){ num_eigenvalues_ = numeig; }
+
+  /** @brief Returns the number of eigenvalues */
+  vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
+
+  /** @brief Sets the exponent of epsilon. Values between 0.6 and 0.9 usually give best results. */
+  void factor(double fct) { factor_ = fct; }
+
+  /** @brief Returns the exponent */
+  double factor() const { return factor_; }
+
+  /** @brief Sets the size of the Krylov space. Must be larger than the number of eigenvalues to compute. */
+  void krylov_size(vcl_size_t max) { krylov_size_ = max; }
+
+  /** @brief Returns the size of the Krylov space */
+  vcl_size_t  krylov_size() const { return krylov_size_; }
+
+  /** @brief Sets the reorthogonalization method */
+  void method(int met){ method_ = met; }
+
+  /** @brief Returns the reorthogonalization method */
+  int method() const { return method_; }
+
+
+private:
+  double factor_;
+  vcl_size_t num_eigenvalues_;
+  int method_; // see enum defined above for possible values
+  vcl_size_t krylov_size_;
+};
+
+
+namespace detail
+{
+  /** @brief Inverse iteration for finding an eigenvector for an eigenvalue.
+   *
+   *  beta[0] to be ignored for consistency.
+   */
+  template<typename NumericT>
+  void inverse_iteration(std::vector<NumericT> const & alphas, std::vector<NumericT> const & betas,
+                         NumericT & eigenvalue, std::vector<NumericT> & eigenvector)
+  {
+    std::vector<NumericT> alpha_sweeped = alphas;
+    for (vcl_size_t i=0; i<alpha_sweeped.size(); ++i)
+      alpha_sweeped[i] -= eigenvalue;
+    for (vcl_size_t row=1; row < alpha_sweeped.size(); ++row)
+      alpha_sweeped[row] -= betas[row] * betas[row] / alpha_sweeped[row-1];
+
+    // starting guess: ignore last equation
+    eigenvector[alphas.size() - 1] = 1.0;
+
+    for (vcl_size_t iter=0; iter<1; ++iter)
+    {
+      // solve first n-1 equations (A - \lambda I) y = -beta[n]
+      eigenvector[alphas.size() - 1] /= alpha_sweeped[alphas.size() - 1];
+      for (vcl_size_t row2=1; row2 < alphas.size(); ++row2)
+      {
+        vcl_size_t row = alphas.size() - row2 - 1;
+        eigenvector[row] -= eigenvector[row+1] * betas[row+1];
+        eigenvector[row] /= alpha_sweeped[row];
+      }
+
+      // normalize eigenvector:
+      NumericT norm_vector = 0;
+      for (vcl_size_t i=0; i<eigenvector.size(); ++i)
+        norm_vector += eigenvector[i] * eigenvector[i];
+      norm_vector = std::sqrt(norm_vector);
+      for (vcl_size_t i=0; i<eigenvector.size(); ++i)
+        eigenvector[i] /= norm_vector;
+    }
+
+    //eigenvalue = (alphas[0] * eigenvector[0] + betas[1] * eigenvector[1]) / eigenvector[0];
+  }
+
+  /**
+  *   @brief Implementation of the Lanczos PRO algorithm (partial reorthogonalization)
+  *
+  *   @param A              The system matrix
+  *   @param r              Random start vector
+  *   @param eigenvectors_A Dense matrix holding the eigenvectors of A (one eigenvector per column)
+  *   @param size           Size of the Krylov space
+  *   @param tag            lanczos_tag with several options for the algorithm
+  *   @param compute_eigenvectors   Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
+  *   @return               Returns the eigenvalues (number of eigenvalues equals size of the Krylov space)
+  */
+
+  template<typename MatrixT, typename DenseMatrixT, typename NumericT>
+  std::vector<NumericT>
+  lanczosPRO (MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t size, lanczos_tag const & tag, bool compute_eigenvectors)
+  {
+    // generation of some random numbers, used for the Lanczos PRO algorithm
+    viennacl::tools::normal_random_numbers<NumericT> get_N;
+
+    std::vector<vcl_size_t> l_bound(size/2), u_bound(size/2);
+    vcl_size_t n = r.size();
+    std::vector<NumericT> w(size), w_old(size); //w_k, w_{k-1}
+
+    NumericT inner_rt;
+    std::vector<NumericT> alphas, betas;
+    viennacl::matrix<NumericT, viennacl::column_major> Q(n, size); //column-major matrix holding the Krylov basis vectors
+
+    bool second_step = false;
+    NumericT eps = std::numeric_limits<NumericT>::epsilon();
+    NumericT squ_eps = std::sqrt(eps);
+    NumericT eta = std::exp(std::log(eps) * tag.factor());
+
+    NumericT beta = viennacl::linalg::norm_2(r);
+
+    r /= beta;
+
+    viennacl::vector_base<NumericT> q_0(Q.handle(), Q.size1(), 0, 1);
+    q_0 = r;
+
+    viennacl::vector<NumericT> u = viennacl::linalg::prod(A, r);
+    NumericT alpha = viennacl::linalg::inner_prod(u, r);
+    alphas.push_back(alpha);
+    w[0] = 1;
+    betas.push_back(beta);
+
+    vcl_size_t batches = 0;
+    for (vcl_size_t i = 1; i < size; i++) // Main loop for setting up the Krylov space
+    {
+      viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
+      r = u - alpha * q_iminus1;
+      beta = viennacl::linalg::norm_2(r);
+
+      betas.push_back(beta);
+      r = r / beta;
+
+      //
+      // Update recurrence relation for estimating orthogonality loss
+      //
+      w_old = w;
+      w[0] = (betas[1] * w_old[1] + (alphas[0] - alpha) * w_old[0] - betas[i - 1] * w_old[0]) / beta + eps * 0.3 * get_N() * (betas[1] + beta);
+      for (vcl_size_t j = 1; j < i - 1; j++)
+        w[j] = (betas[j + 1] * w_old[j + 1] + (alphas[j] - alpha) * w_old[j] + betas[j] * w_old[j - 1] - betas[i - 1] * w_old[j]) / beta + eps * 0.3 * get_N() * (betas[j + 1] + beta);
+      w[i-1] = 0.6 * eps * NumericT(n) * get_N() * betas[1] / beta;
+
+      //
+      // Check whether there has been a need for reorthogonalization detected in the previous iteration.
+      // If so, run the reorthogonalization for each batch
+      //
+      if (second_step)
+      {
+        for (vcl_size_t j = 0; j < batches; j++)
+        {
+          for (vcl_size_t k = l_bound[j] + 1; k < u_bound[j] - 1; k++)
+          {
+            viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+            inner_rt = viennacl::linalg::inner_prod(r, q_k);
+            r = r - inner_rt * q_k;
+            w[k] = 1.5 * eps * get_N();
+          }
+        }
+        NumericT temp = viennacl::linalg::norm_2(r);
+        r = r / temp;
+        beta = beta * temp;
+        second_step = false;
+      }
+      batches = 0;
+
+      //
+      // Check for semiorthogonality
+      //
+      for (vcl_size_t j = 0; j < i; j++)
+      {
+        if (std::fabs(w[j]) >= squ_eps) // tentative loss of orthonormality, hence reorthogonalize
+        {
+          viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
+          inner_rt = viennacl::linalg::inner_prod(r, q_j);
+          r = r - inner_rt * q_j;
+          w[j] = 1.5 * eps * get_N();
+          vcl_size_t k = j - 1;
+
+          // orthogonalization with respect to earlier basis vectors
+          while (std::fabs(w[k]) > eta)
+          {
+            viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+            inner_rt = viennacl::linalg::inner_prod(r, q_k);
+            r = r - inner_rt * q_k;
+            w[k] = 1.5 * eps * get_N();
+            if (k == 0) break;
+            k--;
+          }
+          l_bound[batches] = k;
+
+          // orthogonalization with respect to later basis vectors
+          k = j + 1;
+          while (k < i && std::fabs(w[k]) > eta)
+          {
+            viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+            inner_rt = viennacl::linalg::inner_prod(r, q_k);
+            r = r - inner_rt * q_k;
+            w[k] = 1.5 * eps * get_N();
+            k++;
+          }
+          u_bound[batches] = k - 1;
+          batches++;
+
+          j = k-1; // go to end of current batch
+        }
+      }
+
+      //
+      // Renormalize r if reorthogonalization took place and schedule a second reorthogonalization pass for the next iteration
+      //
+      if (batches > 0)
+      {
+        NumericT temp = viennacl::linalg::norm_2(r);
+        r = r / temp;
+        beta = beta * temp;
+        second_step = true;
+      }
+
+      // store Krylov vector in Q:
+      viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);
+      q_i = r;
+
+      //
+      // determine and store alpha = <r, u> with u = A q_i - beta q_{i-1}
+      //
+      u = viennacl::linalg::prod(A, r);
+      u += (-beta) * q_iminus1;
+      alpha = viennacl::linalg::inner_prod(u, r);
+      alphas.push_back(alpha);
+    }
+
+    //
+    // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
+    //
+    std::vector<NumericT> eigenvalues = bisect(alphas, betas);
+
+    //
+    // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
+    //
+    if (compute_eigenvectors)
+    {
+      std::vector<NumericT> eigenvector_tridiag(alphas.size());
+      for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
+      {
+        // compute eigenvector of tridiagonal matrix via inverse iteration:
+        inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);
+
+        // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
+        viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
+        viennacl::copy(eigenvector_tridiag, eigenvector_u);
+
+        viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
+                                                      eigenvectors_A.size1(),
+                                                      eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
+                                                      eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
+        eigenvector_A = viennacl::linalg::prod(project(Q,
+                                                       range(0, Q.size1()),
+                                                       range(0, eigenvector_u.size())),
+                                               eigenvector_u);
+      }
+    }
+
+    return eigenvalues;
+  }
+
+
+  /**
+  *   @brief Implementation of the Lanczos algorithm (with full reorthogonalization or without reorthogonalization)
+  *
+  *   @param A            The system matrix
+  *   @param r            Random start vector
+  *   @param eigenvectors_A  A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
+  *   @param krylov_dim   Size of the Krylov space
+  *   @param tag          The Lanczos tag holding tolerances, etc.
+  *   @param compute_eigenvectors   Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating the eigenvalues.
+  *   @return             Returns the eigenvalues (the number of eigenvalues equals the size of the Krylov space)
+  */
+  template< typename MatrixT, typename DenseMatrixT, typename NumericT>
+  std::vector<NumericT>
+  lanczos(MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t krylov_dim, lanczos_tag const & tag, bool compute_eigenvectors)
+  {
+    std::vector<NumericT> alphas, betas;
+    viennacl::vector<NumericT> Aq(r.size());
+    viennacl::matrix<NumericT, viennacl::column_major> Q(r.size(), krylov_dim + 1);  // Krylov basis (each Krylov vector is one column)
+
+    NumericT norm_r = norm_2(r);
+    NumericT beta = norm_r;
+    r /= norm_r;
+
+    // first Krylov vector:
+    viennacl::vector_base<NumericT> q0(Q.handle(), Q.size1(), 0, 1);
+    q0 = r;
+
+    //
+    // Step 1: Run Lanczos' method to obtain tridiagonal matrix
+    //
+    for (vcl_size_t i = 0; i < krylov_dim; i++)
+    {
+      betas.push_back(beta);
+      // last available vector from Krylov basis:
+      viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);
+
+      // Lanczos algorithm:
+      // - Compute A * q:
+      Aq = viennacl::linalg::prod(A, q_i);
+
+      // - Form Aq <- Aq - <Aq, q_i> * q_i - beta * q_{i-1}, where beta is ||q_i|| before normalization in previous iteration
+      NumericT alpha = viennacl::linalg::inner_prod(Aq, q_i);
+      Aq -= alpha * q_i;
+
+      if (i > 0)
+      {
+        viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
+        Aq -= beta * q_iminus1;
+
+        // Extra measures for improved numerical stability?
+        if (tag.method() == lanczos_tag::full_reorthogonalization)
+        {
+          // Gram-Schmidt (re-)orthogonalization:
+          // TODO: Reuse fast (pipelined) routines from GMRES or GEMV
+          for (vcl_size_t j = 0; j < i; j++)
+          {
+            viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
+            NumericT inner_rq = viennacl::linalg::inner_prod(Aq, q_j);
+            Aq -= inner_rq * q_j;
+          }
+        }
+      }
+
+      // normalize Aq and add to Krylov basis at column i+1 in Q:
+      beta = viennacl::linalg::norm_2(Aq);
+      viennacl::vector_base<NumericT> q_iplus1(Q.handle(), Q.size1(), (i+1) * Q.internal_size1(), 1);
+      q_iplus1 = Aq / beta;
+
+      alphas.push_back(alpha);
+    }
+
+    //
+    // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
+    //
+    std::vector<NumericT> eigenvalues = bisect(alphas, betas);
+
+    //
+    // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
+    //
+    if (compute_eigenvectors)
+    {
+      std::vector<NumericT> eigenvector_tridiag(alphas.size());
+      for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
+      {
+        // compute eigenvector of tridiagonal matrix via inverse iteration:
+        inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);
+
+        // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
+        viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
+        viennacl::copy(eigenvector_tridiag, eigenvector_u);
+
+        viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
+                                                      eigenvectors_A.size1(),
+                                                      eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
+                                                      eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
+        eigenvector_A = viennacl::linalg::prod(project(Q,
+                                                       range(0, Q.size1()),
+                                                       range(0, eigenvector_u.size())),
+                                               eigenvector_u);
+      }
+    }
+
+    return eigenvalues;
+  }
+
+} // end namespace detail
+
+/**
+*   @brief Computes eigenvalues (and optionally eigenvectors) using the Lanczos method (with and without reorthogonalization).
+*
+*   The Lanczos variant with partial reorthogonalization is implemented separately.
+*
+*   @param matrix          The system matrix
+*   @param eigenvectors_A  A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
+*   @param tag             Tag with several options for the Lanczos algorithm
+*   @param compute_eigenvectors   Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating the eigenvalues.
+*   @return                Returns the n largest eigenvalues (n is defined in the lanczos_tag)
+*/
+template<typename MatrixT, typename DenseMatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, DenseMatrixT & eigenvectors_A, lanczos_tag const & tag, bool compute_eigenvectors = true)
+{
+  typedef typename viennacl::result_of::value_type<MatrixT>::type           NumericType;
+  typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;
+  typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
+
+  viennacl::tools::uniform_random_numbers<CPU_NumericType> random_gen;
+
+  std::vector<CPU_NumericType> eigenvalues;
+  vcl_size_t matrix_size = matrix.size1();
+  VectorT r(matrix_size);
+  std::vector<CPU_NumericType> s(matrix_size);
+
+  for (vcl_size_t i=0; i<s.size(); ++i)
+    s[i] = CPU_NumericType(0.5) + random_gen();
+
+  detail::copy_vec_to_vec(s,r);
+
+  vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
+                                                              : tag.krylov_size();
+
+  switch (tag.method())
+  {
+  case lanczos_tag::partial_reorthogonalization:
+    eigenvalues = detail::lanczosPRO(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+    break;
+  case lanczos_tag::full_reorthogonalization:
+  case lanczos_tag::no_reorthogonalization:
+    eigenvalues = detail::lanczos(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+    break;
+  }
+
+  std::vector<CPU_NumericType> largest_eigenvalues;
+
+  for (vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
+    largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
+
+
+  return largest_eigenvalues;
+}
+
+
+/**
+*   @brief Computes the eigenvalues using the Lanczos method (with and without reorthogonalization).
+*
+*   The Lanczos variant with partial reorthogonalization is implemented separately.
+*
+*   @param matrix        The system matrix
+*   @param tag           Tag with several options for the Lanczos algorithm
+*   @return              Returns the n largest eigenvalues (n is defined in the lanczos_tag)
+*/
+template<typename MatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, lanczos_tag const & tag)
+{
+  typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type  NumericType;
+
+  viennacl::matrix<NumericType> eigenvectors(matrix.size1(), tag.num_eigenvalues());
+  return eig(matrix, eigenvectors, tag, false);
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
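A minimal usage sketch for the eig() entry points added in this file (not part of the
committed code; the matrix type, its dimensions, the default-constructed lanczos_tag and
the include paths are illustrative assumptions):

    #include <vector>
    #include "viennacl/matrix.hpp"
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/lanczos.hpp"

    int main()
    {
      viennacl::compressed_matrix<double> A(1000, 1000);  // sparse system matrix, assumed to be filled elsewhere
      viennacl::linalg::lanczos_tag tag;                   // default options (number of eigenvalues, Krylov size, ...)

      // eigenvalues only (uses the two-argument overload above):
      std::vector<double> eigenvalues = viennacl::linalg::eig(A, tag);

      // eigenvalues plus eigenvectors, one eigenvector per column of 'eigenvectors':
      viennacl::matrix<double> eigenvectors(A.size1(), tag.num_eigenvalues());
      std::vector<double> eigenvalues2 = viennacl::linalg::eig(A, eigenvectors, tag);

      (void)eigenvalues; (void)eigenvalues2;
      return 0;
    }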

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
new file mode 100644
index 0000000..0bdd037
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
@@ -0,0 +1,227 @@
+#ifndef VIENNACL_LINALG_LU_HPP
+#define VIENNACL_LINALG_LU_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lu.hpp
+    @brief Implementations of LU factorization for row-major and column-major dense matrices.
+*/
+
+#include <algorithm>    //for std::min
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+/** @brief LU factorization of a row-major dense matrix.
+*
+* @param A    The system matrix. The factors L and U are written directly into A; the implicit unit diagonal of L is not stored.
+*/
+template<typename NumericT>
+void lu_factorize(matrix<NumericT, viennacl::row_major> & A)
+{
+  typedef matrix<NumericT, viennacl::row_major>  MatrixType;
+
+  vcl_size_t max_block_size = 32;
+  vcl_size_t num_blocks = (A.size2() - 1) / max_block_size + 1;
+  std::vector<NumericT> temp_buffer(A.internal_size2() * max_block_size);
+
+  // Iterate over panels
+  for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+  {
+    vcl_size_t row_start = panel_id * max_block_size;
+    vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - row_start, max_block_size);
+
+    viennacl::range     block_range(row_start, row_start + current_block_size);
+    viennacl::range remainder_range(row_start + current_block_size, A.size1());
+
+    //
+    // Perform LU factorization on panel:
+    //
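+    // (The panel is read into a host-side buffer, factorized there with the classical
+    //  kij-loop and without pivoting, and then written back; the trailing-submatrix
+    //  update further below is expressed in ViennaCL matrix operations and therefore
+    //  runs on the active compute backend.)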
+
+
+    // Read from matrix to buffer:
+    viennacl::backend::memory_read(A.handle(),
+                                   sizeof(NumericT) * row_start          * A.internal_size2(),
+                                   sizeof(NumericT) * current_block_size * A.internal_size2(),
+                                   &(temp_buffer[0]));
+
+    // Factorize (kij-version):
+    for (vcl_size_t k=0; k < current_block_size - 1; ++k)
+    {
+      for (vcl_size_t i=k+1; i < current_block_size; ++i)
+      {
+        temp_buffer[row_start + i * A.internal_size2() + k] /= temp_buffer[row_start + k * A.internal_size2() + k];  // write l_ik
+
+        NumericT l_ik = temp_buffer[row_start + i * A.internal_size2() + k];
+
+        for (vcl_size_t j = row_start + k + 1; j < A.size1(); ++j)
+          temp_buffer[i * A.internal_size2() + j] -= l_ik * temp_buffer[k * A.internal_size2() + j];  // l_ik * a_kj
+      }
+    }
+
+    // Write back:
+    viennacl::backend::memory_write(A.handle(),
+                                    sizeof(NumericT) * row_start          * A.internal_size2(),
+                                    sizeof(NumericT) * current_block_size * A.internal_size2(),
+                                    &(temp_buffer[0]));
+
+    if (remainder_range.size() > 0)
+    {
+      //
+      // Compute L_21 = [ (U_11^T)^{-1} A_21^T ]^T
+      //
+      viennacl::matrix_range<MatrixType> U_11(A, block_range,     block_range);
+      viennacl::matrix_range<MatrixType> A_21(A, remainder_range, block_range);
+      viennacl::linalg::inplace_solve(trans(U_11), trans(A_21), viennacl::linalg::lower_tag());
+
+      //
+      // Update remainder of A
+      //
+      viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+      viennacl::matrix_range<MatrixType> U_12(A, block_range,     remainder_range);
+      viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+      A_22 -= viennacl::linalg::prod(L_21, U_12);
+    }
+  }
+
+}
+
+
+/** @brief LU factorization of a column-major dense matrix.
+*
+* @param A    The system matrix. The factors L and U are written directly into A; the implicit unit diagonal of L is not stored.
+*/
+*/
+template<typename NumericT>
+void lu_factorize(matrix<NumericT, viennacl::column_major> & A)
+{
+  typedef matrix<NumericT, viennacl::column_major>  MatrixType;
+
+  vcl_size_t max_block_size = 32;
+  vcl_size_t num_blocks = (A.size1() - 1) / max_block_size + 1;
+  std::vector<NumericT> temp_buffer(A.internal_size1() * max_block_size);
+
+  // Iterate over panels
+  for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+  {
+    vcl_size_t col_start = panel_id * max_block_size;
+    vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - col_start, max_block_size);
+
+    viennacl::range     block_range(col_start, col_start + current_block_size);
+    viennacl::range remainder_range(col_start + current_block_size, A.size1());
+
+    //
+    // Perform LU factorization on panel:
+    //
+
+
+    // Read from matrix to buffer:
+    viennacl::backend::memory_read(A.handle(),
+                                   sizeof(NumericT) * col_start          * A.internal_size1(),
+                                   sizeof(NumericT) * current_block_size * A.internal_size1(),
+                                   &(temp_buffer[0]));
+
+    // Factorize (kji-version):
+    for (vcl_size_t k=0; k < current_block_size; ++k)
+    {
+      NumericT a_kk = temp_buffer[col_start + k + k * A.internal_size1()];
+      for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+        temp_buffer[i + k * A.internal_size1()] /= a_kk;  // write l_ik
+
+      for (vcl_size_t j=k+1; j < current_block_size; ++j)
+      {
+        NumericT a_kj = temp_buffer[col_start + k + j * A.internal_size1()];
+        for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+          temp_buffer[i + j * A.internal_size1()] -= temp_buffer[i + k * A.internal_size1()] * a_kj;  // l_ik * a_kj
+      }
+    }
+
+    // Write back:
+    viennacl::backend::memory_write(A.handle(),
+                                    sizeof(NumericT) * col_start          * A.internal_size1(),
+                                    sizeof(NumericT) * current_block_size * A.internal_size1(),
+                                    &(temp_buffer[0]));
+
+    if (remainder_range.size() > 0)
+    {
+      //
+      // Compute U_12:
+      //
+      viennacl::matrix_range<MatrixType> L_11(A, block_range,     block_range);
+      viennacl::matrix_range<MatrixType> A_12(A, block_range, remainder_range);
+      viennacl::linalg::inplace_solve(L_11, A_12, viennacl::linalg::unit_lower_tag());
+
+      //
+      // Update remainder of A
+      //
+      viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+      viennacl::matrix_range<MatrixType> U_12(A, block_range,     remainder_range);
+      viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+      A_22 -= viennacl::linalg::prod(L_21, U_12);
+    }
+
+  }
+
+}
+
+
+//
+// Convenience layer:
+//
+
+/** @brief LU substitution for the system L U X = B.
+*
+* @param A    The system matrix holding the LU factors as written by lu_factorize(). The implicit unit diagonal of L is not stored.
+* @param B    The matrix of right-hand side vectors; the solution is written directly into B.
+*/
+*/
+template<typename NumericT, typename F1, typename F2, unsigned int AlignmentV1, unsigned int AlignmentV2>
+void lu_substitute(matrix<NumericT, F1, AlignmentV1> const & A,
+                   matrix<NumericT, F2, AlignmentV2> & B)
+{
+  assert(A.size1() == A.size2() && bool("Matrix must be square"));
+  assert(A.size1() == B.size1() && bool("Number of rows of A and B must match"));
+  inplace_solve(A, B, unit_lower_tag());
+  inplace_solve(A, B, upper_tag());
+}
+
+/** @brief LU substitution for the system L U x = b.
+*
+* @param A      The system matrix holding the LU factors as written by lu_factorize(). The implicit unit diagonal of L is not stored.
+* @param vec    The right-hand side vector; the solution is written directly into vec.
+*/
+template<typename NumericT, typename F, unsigned int MatAlignmentV, unsigned int VecAlignmentV>
+void lu_substitute(matrix<NumericT, F, MatAlignmentV> const & A,
+                   vector<NumericT, VecAlignmentV> & vec)
+{
+  assert(A.size1() == A.size2() && bool("Matrix must be square"));
+  inplace_solve(A, vec, unit_lower_tag());
+  inplace_solve(A, vec, upper_tag());
+}
+
+}
+}
+
+#endif
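A minimal usage sketch for the routines in this file (not part of the committed code; the
problem size, the fill step and the include paths are illustrative assumptions, and since
the factorization above does no pivoting, the matrix is assumed to be safely factorizable,
e.g. diagonally dominant):

    #include <cstddef>
    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/lu.hpp"

    int main()
    {
      std::size_t n = 64;
      viennacl::matrix<double> A(n, n);   // dense system matrix
      viennacl::vector<double> b(n);      // right-hand side
      // ... fill A and b, e.g. via viennacl::copy() from host data ...

      viennacl::linalg::lu_factorize(A);      // A now holds U and the strictly lower part of L (unit diagonal implied)
      viennacl::linalg::lu_substitute(A, b);  // b is overwritten with the solution x of A x = b

      return 0;
    }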


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/forwards.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/forwards.h b/native-viennaCL/src/main/cpp/viennacl/forwards.h
new file mode 100644
index 0000000..23a4580
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/forwards.h
@@ -0,0 +1,1032 @@
+#ifndef VIENNACL_FORWARDS_H
+#define VIENNACL_FORWARDS_H
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/forwards.h
+    @brief This file provides the forward declarations for the main types used within ViennaCL
+*/
+
+/**
+ @mainpage Main Page
+
+ Here you can find all the documentation on how to use the GPU-accelerated linear algebra library ViennaCL.
+ The formerly separate \ref usermanual "user manual" is no longer available as a standalone PDF, but is now fully integrated into the HTML-based documentation.
+ Please use the navigation panel on the left to access the desired information.
+
+ Quick links:
+     - \ref manual-installation "Installation and building the examples"
+     - \ref manual-types        "Basic types"
+     - \ref manual-operations   "Basic operations"
+     - \ref manual-algorithms   "Algorithms"
+
+
+ -----------------------------------
+ \htmlonly
+ <div style="align: right; width: 100%">
+ <a href="http://www.tuwien.ac.at/"><img src="tuwien.png"></a>
+ <a href="http://www.iue.tuwien.ac.at/"><img src="iue.png"></a>
+ <a href="http://www.asc.tuwien.ac.at/"><img src="asc.png"></a>
+ </div>
+ \endhtmlonly
+*/
+
+
+//compatibility defines:
+#ifdef VIENNACL_HAVE_UBLAS
+  #define VIENNACL_WITH_UBLAS
+#endif
+
+#ifdef VIENNACL_HAVE_EIGEN
+  #define VIENNACL_WITH_EIGEN
+#endif
+
+#ifdef VIENNACL_HAVE_MTL4
+  #define VIENNACL_WITH_MTL4
+#endif
+
+#include <cstddef>
+#include <cassert>
+#include <string>
+#include <stdexcept>
+
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/version.hpp"
+
+/** @brief Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them. */
+namespace viennacl
+{
+  typedef std::size_t                                       vcl_size_t;
+  typedef std::ptrdiff_t                                    vcl_ptrdiff_t;
+
+
+
+  /** @brief A tag class representing assignment */
+  struct op_assign {};
+  /** @brief A tag class representing inplace addition */
+  struct op_inplace_add {};
+  /** @brief A tag class representing inplace subtraction */
+  struct op_inplace_sub {};
+
+  /** @brief A tag class representing addition */
+  struct op_add {};
+  /** @brief A tag class representing subtraction */
+  struct op_sub {};
+  /** @brief A tag class representing multiplication by a scalar */
+  struct op_mult {};
+  /** @brief A tag class representing matrix-vector products and element-wise multiplications*/
+  struct op_prod {};
+  /** @brief A tag class representing matrix-matrix products */
+  struct op_mat_mat_prod {};
+  /** @brief A tag class representing division */
+  struct op_div {};
+  /** @brief A tag class representing the power function */
+  struct op_pow {};
+
+  /** @brief A tag class representing equality */
+  struct op_eq {};
+  /** @brief A tag class representing inequality */
+  struct op_neq {};
+  /** @brief A tag class representing greater-than */
+  struct op_greater {};
+  /** @brief A tag class representing less-than */
+  struct op_less {};
+  /** @brief A tag class representing greater-than-or-equal-to */
+  struct op_geq {};
+  /** @brief A tag class representing less-than-or-equal-to */
+  struct op_leq {};
+
+  /** @brief A tag class representing the summation of a vector */
+  struct op_sum {};
+
+  /** @brief A tag class representing the summation of all rows of a matrix */
+  struct op_row_sum {};
+
+  /** @brief A tag class representing the summation of all columns of a matrix */
+  struct op_col_sum {};
+
+  /** @brief A tag class representing element-wise casting operations on vectors and matrices */
+  template<typename OP>
+  struct op_element_cast {};
+
+  /** @brief A tag class representing element-wise binary operations (like multiplication) on vectors or matrices */
+  template<typename OP>
+  struct op_element_binary {};
+
+  /** @brief A tag class representing element-wise unary operations (like sin()) on vectors or matrices */
+  template<typename OP>
+  struct op_element_unary {};
+
+  /** @brief A tag class representing the modulus function for integers */
+  struct op_abs {};
+  /** @brief A tag class representing the acos() function */
+  struct op_acos {};
+  /** @brief A tag class representing the asin() function */
+  struct op_asin {};
+  /** @brief A tag class for representing the argmax() function */
+  struct op_argmax {};
+  /** @brief A tag class for representing the argmin() function */
+  struct op_argmin {};
+  /** @brief A tag class representing the atan() function */
+  struct op_atan {};
+  /** @brief A tag class representing the atan2() function */
+  struct op_atan2 {};
+  /** @brief A tag class representing the ceil() function */
+  struct op_ceil {};
+  /** @brief A tag class representing the cos() function */
+  struct op_cos {};
+  /** @brief A tag class representing the cosh() function */
+  struct op_cosh {};
+  /** @brief A tag class representing the exp() function */
+  struct op_exp {};
+  /** @brief A tag class representing the fabs() function */
+  struct op_fabs {};
+  /** @brief A tag class representing the fdim() function */
+  struct op_fdim {};
+  /** @brief A tag class representing the floor() function */
+  struct op_floor {};
+  /** @brief A tag class representing the fmax() function */
+  struct op_fmax {};
+  /** @brief A tag class representing the fmin() function */
+  struct op_fmin {};
+  /** @brief A tag class representing the fmod() function */
+  struct op_fmod {};
+  /** @brief A tag class representing the log() function */
+  struct op_log {};
+  /** @brief A tag class representing the log10() function */
+  struct op_log10 {};
+  /** @brief A tag class representing the sin() function */
+  struct op_sin {};
+  /** @brief A tag class representing the sinh() function */
+  struct op_sinh {};
+  /** @brief A tag class representing the sqrt() function */
+  struct op_sqrt {};
+  /** @brief A tag class representing the tan() function */
+  struct op_tan {};
+  /** @brief A tag class representing the tanh() function */
+  struct op_tanh {};
+
+  /** @brief A tag class representing the (off-)diagonal of a matrix */
+  struct op_matrix_diag {};
+
+  /** @brief A tag class representing a matrix given by a vector placed on a certain (off-)diagonal */
+  struct op_vector_diag {};
+
+  /** @brief A tag class representing the extraction of a matrix row to a vector */
+  struct op_row {};
+
+  /** @brief A tag class representing the extraction of a matrix column to a vector */
+  struct op_column {};
+
+  /** @brief A tag class representing inner products of two vectors */
+  struct op_inner_prod {};
+
+  /** @brief A tag class representing the 1-norm of a vector */
+  struct op_norm_1 {};
+
+  /** @brief A tag class representing the 2-norm of a vector */
+  struct op_norm_2 {};
+
+  /** @brief A tag class representing the inf-norm of a vector */
+  struct op_norm_inf {};
+
+  /** @brief A tag class representing the maximum of a vector */
+  struct op_max {};
+
+  /** @brief A tag class representing the minimum of a vector */
+  struct op_min {};
+
+
+  /** @brief A tag class representing the Frobenius-norm of a matrix */
+  struct op_norm_frobenius {};
+
+  /** @brief A tag class representing transposed matrices */
+  struct op_trans {};
+
+  /** @brief A tag class representing sign flips (for scalars only. Vectors and matrices use the standard multiplication by the scalar -1.0) */
+  struct op_flip_sign {};
+
+  //forward declaration of basic types:
+  template<class TYPE>
+  class scalar;
+
+  template<typename LHS, typename RHS, typename OP>
+  class scalar_expression;
+
+  template<typename SCALARTYPE>
+  class entry_proxy;
+
+  template<typename SCALARTYPE>
+  class const_entry_proxy;
+
+  template<typename LHS, typename RHS, typename OP>
+  class vector_expression;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT>
+  class vector_iterator;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT>
+  class const_vector_iterator;
+
+  template<typename SCALARTYPE>
+  class implicit_vector_base;
+
+  template<typename SCALARTYPE>
+  struct zero_vector;
+
+  template<typename SCALARTYPE>
+  struct unit_vector;
+
+  template<typename SCALARTYPE>
+  struct one_vector;
+
+  template<typename SCALARTYPE>
+  struct scalar_vector;
+
+  template<class SCALARTYPE, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+  class vector_base;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class vector;
+
+  template<typename ScalarT>
+  class vector_tuple;
+
+  //the following forwards are needed for GMRES
+  template<typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void copy(CPU_ITERATOR const & cpu_begin,
+            CPU_ITERATOR const & cpu_end,
+            vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin);
+
+  template<typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+  void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+            const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+            vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
+
+  template<typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+  void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+            const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+            const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
+
+  template<typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                 const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                 CPU_ITERATOR cpu_begin );
+
+  template<typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+  void fast_copy(CPU_ITERATOR const & cpu_begin,
+                  CPU_ITERATOR const & cpu_end,
+                  vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin);
+
+
+  /** @brief Tag class for indicating row-major layout of a matrix. Not passed to the matrix directly, see row_major type. */
+  struct row_major_tag {};
+  /** @brief Tag class for indicating column-major layout of a matrix. Not passed to the matrix directly, see column_major type. */
+  struct column_major_tag {};
+
+  /** @brief A tag for row-major storage of a dense matrix. */
+  struct row_major
+  {
+    typedef row_major_tag         orientation_category;
+
+    /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+    *
+    * @param i   row index
+    * @param j   column index
+    * @param num_cols  number of columns of the matrix (including alignment)
+    */
+    static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t /* num_rows */, vcl_size_t num_cols)
+    {
+      return i * num_cols + j;
+    }
+  };
+
+  /** @brief A tag for column-major storage of a dense matrix. */
+  struct column_major
+  {
+    typedef column_major_tag         orientation_category;
+
+    /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+    *
+    * @param i   row index
+    * @param j   column index
+    * @param num_rows  number of rows of the matrix (including alignment)
+    */
+    static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t /* num_cols */)
+    {
+      return i + j * num_rows;
+    }
+  };
+
+  struct row_iteration;
+  struct col_iteration;
+
+  template<typename LHS, typename RHS, typename OP>
+  class matrix_expression;
+
+  class context;
+
+  enum memory_types
+  {
+    MEMORY_NOT_INITIALIZED
+    , MAIN_MEMORY
+    , OPENCL_MEMORY
+    , CUDA_MEMORY
+  };
+
+  namespace backend
+  {
+    class mem_handle;
+  }
+
+  //
+  // Matrix types:
+  //
+  static const vcl_size_t dense_padding_size = 128;
+
+  /** @brief A dense matrix class
+  *
+  * @tparam SCALARTYPE   The underlying scalar type (either float or double)
+  * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+  */
+  template<typename ROWCOL, typename MATRIXTYPE>
+  class matrix_iterator;
+
+  template<class SCALARTYPE, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+  class matrix_base;
+
+  template<class SCALARTYPE, typename F = row_major, unsigned int ALIGNMENT = 1>
+  class matrix;
+
+  template<typename SCALARTYPE>
+  class implicit_matrix_base;
+
+  template<class SCALARTYPE>
+  class identity_matrix;
+
+  template<class SCALARTYPE>
+  class zero_matrix;
+
+  template<class SCALARTYPE>
+  class scalar_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class compressed_matrix;
+
+  template<class SCALARTYPE>
+  class compressed_compressed_matrix;
+
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 128>
+  class coordinate_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class ell_matrix;
+
+  template<typename ScalarT, typename IndexT = unsigned int>
+  class sliced_ell_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class hyb_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class circulant_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class hankel_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class toeplitz_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class vandermonde_matrix;
+
+  //
+  // Proxies:
+  //
+  template<typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+  class basic_range;
+
+  typedef basic_range<>  range;
+
+  template<typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+  class basic_slice;
+
+  typedef basic_slice<>  slice;
+
+  template<typename VectorType>
+  class vector_range;
+
+  template<typename VectorType>
+  class vector_slice;
+
+  template<typename MatrixType>
+  class matrix_range;
+
+  template<typename MatrixType>
+  class matrix_slice;
+
+
+  /** @brief Helper struct for checking whether a type is a host scalar type (e.g. float, double) */
+  template<typename T>
+  struct is_cpu_scalar
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper struct for checking whether a type is a viennacl::scalar<> */
+  template<typename T>
+  struct is_scalar
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper struct for checking whether a type represents a sign flip on a viennacl::scalar<> */
+  template<typename T>
+  struct is_flip_sign_scalar
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper struct for checking whether the provided type represents a scalar (either host, from ViennaCL, or a flip-sign proxy) */
+  template<typename T>
+  struct is_any_scalar
+  {
+    enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+  };
+
+  /** @brief Checks for a type being either vector_base or implicit_vector_base */
+  template<typename T>
+  struct is_any_vector { enum { value = 0 }; };
+
+  /** @brief Checks for either matrix_base or implicit_matrix_base */
+  template<typename T>
+  struct is_any_dense_matrix { enum { value = 0 }; };
+
+  /** @brief Helper class for checking whether a matrix has a row-major layout. */
+  template<typename T>
+  struct is_row_major
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a compressed_matrix (CSR format) */
+  template<typename T>
+  struct is_compressed_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a coordinate_matrix (COO format) */
+  template<typename T>
+  struct is_coordinate_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is an ell_matrix (ELL format) */
+  template<typename T>
+  struct is_ell_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a sliced_ell_matrix (SELL-C-\f$ \sigma \f$ format) */
+  template<typename T>
+  struct is_sliced_ell_matrix
+  {
+    enum { value = false };
+  };
+
+
+  /** @brief Helper class for checking whether a matrix is a hyb_matrix (hybrid format: ELL plus CSR) */
+  template<typename T>
+  struct is_hyb_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether the provided type is one of the sparse matrix types (compressed_matrix, coordinate_matrix, etc.) */
+  template<typename T>
+  struct is_any_sparse_matrix
+  {
+    enum { value = false };
+  };
+
+
+  /** @brief Helper class for checking whether a matrix is a circulant matrix */
+  template<typename T>
+  struct is_circulant_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Hankel matrix */
+  template<typename T>
+  struct is_hankel_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Toeplitz matrix */
+  template<typename T>
+  struct is_toeplitz_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Vandermonde matrix */
+  template<typename T>
+  struct is_vandermonde_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether the provided type is any of the dense structured matrix types (circulant, Hankel, etc.) */
+  template<typename T>
+  struct is_any_dense_structured_matrix
+  {
+    enum { value = viennacl::is_circulant_matrix<T>::value || viennacl::is_hankel_matrix<T>::value || viennacl::is_toeplitz_matrix<T>::value || viennacl::is_vandermonde_matrix<T>::value };
+  };
+
+
+
+
+  /** @brief Exception class in case of memory errors */
+  class memory_exception : public std::exception
+  {
+  public:
+    memory_exception() : message_() {}
+    memory_exception(std::string message) : message_("ViennaCL: Internal memory error: " + message) {}
+
+    virtual const char* what() const throw() { return message_.c_str(); }
+
+    virtual ~memory_exception() throw() {}
+  private:
+    std::string message_;
+  };
+
+  class cuda_not_available_exception : public std::exception
+  {
+  public:
+    cuda_not_available_exception() : message_("ViennaCL was compiled without CUDA support, but CUDA functionality is required for this operation.") {}
+
+    virtual const char* what() const throw() { return message_.c_str(); }
+
+    virtual ~cuda_not_available_exception() throw() {}
+  private:
+    std::string message_;
+  };
+
+  class zero_on_diagonal_exception : public std::runtime_error
+  {
+  public:
+    zero_on_diagonal_exception(std::string const & what_arg) : std::runtime_error(what_arg) {}
+  };
+
+  class unknown_norm_exception : public std::runtime_error
+  {
+  public:
+    unknown_norm_exception(std::string const & what_arg) : std::runtime_error(what_arg) {}
+  };
+
+
+
+  namespace tools
+  {
+    //helper for matrix row/col iterators
+    //must be specialized for every viennacl matrix type
+    /** @brief Helper class for incrementing an iterator in a dense matrix. */
+    template<typename ROWCOL, typename MATRIXTYPE>
+    struct MATRIX_ITERATOR_INCREMENTER
+    {
+      typedef typename MATRIXTYPE::ERROR_SPECIALIZATION_FOR_THIS_MATRIX_TYPE_MISSING          ErrorIndicator;
+
+      static void apply(const MATRIXTYPE & /*mat*/, unsigned int & /*row*/, unsigned int & /*col*/) {}
+    };
+  }
+
+  namespace linalg
+  {
+#if !defined(_MSC_VER) || defined(__CUDACC__)
+
+    template<class SCALARTYPE, unsigned int ALIGNMENT>
+    void convolve_i(viennacl::vector<SCALARTYPE, ALIGNMENT>& input1,
+                    viennacl::vector<SCALARTYPE, ALIGNMENT>& input2,
+                    viennacl::vector<SCALARTYPE, ALIGNMENT>& output);
+
+    template<typename T>
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_prod> >
+    element_prod(vector_base<T> const & v1, vector_base<T> const & v2);
+
+    template<typename T>
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_div> >
+    element_div(vector_base<T> const & v1, vector_base<T> const & v2);
+
+
+
+    template<typename T>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result);
+
+    template<typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                         scalar<T> & result);
+
+    template<typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                         scalar<T> & result);
+
+    ///////////////////////////
+
+    template<typename T>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result);
+
+    template<typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                        T & result);
+
+    template<typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename S3>
+    void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                        viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                        S3 & result);
+
+
+
+    //forward definition of norm_1_impl function
+    template<typename T>
+    void norm_1_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     scalar<T> & result);
+
+
+    template<typename T>
+    void norm_1_cpu(vector_base<T> const & vec,
+                    T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result);
+
+    //forward definition of norm_2_impl function
+    template<typename T>
+    void norm_2_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     scalar<T> & result);
+
+    template<typename T>
+    void norm_2_cpu(vector_base<T> const & vec, T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result);
+
+
+    //forward definition of norm_inf_impl function
+    template<typename T>
+    void norm_inf_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      scalar<T> & result);
+
+
+    template<typename T>
+    void norm_inf_cpu(vector_base<T> const & vec, T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      S2 & result);
+
+    //forward definition of max()-related functions
+    template<typename T>
+    void max_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void max_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                  scalar<T> & result);
+
+
+    template<typename T>
+    void max_cpu(vector_base<T> const & vec, T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void max_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                 S2 & result);
+
+    //forward definition of min()-related functions
+    template<typename T>
+    void min_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void min_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                  scalar<T> & result);
+
+
+    template<typename T>
+    void min_cpu(vector_base<T> const & vec, T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void min_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                 S2 & result);
+
+    //forward definition of sum()-related functions
+    template<typename T>
+    void sum_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template<typename LHS, typename RHS, typename OP, typename T>
+    void sum_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                  scalar<T> & result);
+
+
+    template<typename T>
+    void sum_cpu(vector_base<T> const & vec, T & result);
+
+    template<typename LHS, typename RHS, typename OP, typename S2>
+    void sum_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                 S2 & result);
+
+
+    // forward definition of frobenius norm:
+    template<typename T>
+    void norm_frobenius_impl(matrix_base<T> const & vec, scalar<T> & result);
+
+    template<typename T>
+    void norm_frobenius_cpu(matrix_base<T> const & vec, T & result);
+
+
+    template<typename T>
+    vcl_size_t index_norm_inf(vector_base<T> const & vec);
+
+    template<typename LHS, typename RHS, typename OP>
+    vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec);
+
+    //forward definition of prod_impl functions
+
+    template<typename NumericT>
+    void prod_impl(const matrix_base<NumericT> & mat,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result);
+
+    template<typename NumericT>
+    void prod_impl(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & mat_trans,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result);
+
+    template<typename SparseMatrixType, class SCALARTYPE, unsigned int ALIGNMENT>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  vector_expression<const SparseMatrixType,
+                                                    const vector<SCALARTYPE, ALIGNMENT>,
+                                                    op_prod >
+                                 >::type
+    prod_impl(const SparseMatrixType & mat,
+              const vector<SCALARTYPE, ALIGNMENT> & vec);
+
+    // forward definition of summation routines for matrices:
+
+    template<typename NumericT>
+    void row_sum_impl(const matrix_base<NumericT> & A,
+                            vector_base<NumericT> & result);
+
+    template<typename NumericT>
+    void column_sum_impl(const matrix_base<NumericT> & A,
+                               vector_base<NumericT> & result);
+
+#endif
+
+    namespace detail
+    {
+      enum row_info_types
+      {
+        SPARSE_ROW_NORM_INF = 0,
+        SPARSE_ROW_NORM_1,
+        SPARSE_ROW_NORM_2,
+        SPARSE_ROW_DIAGONAL
+      };
+
+    }
+
+
+    /** @brief A tag class representing a lower triangular matrix */
+    struct lower_tag
+    {
+      static const char * name() { return "lower"; }
+    };      //lower triangular matrix
+    /** @brief A tag class representing an upper triangular matrix */
+    struct upper_tag
+    {
+      static const char * name() { return "upper"; }
+    };      //upper triangular matrix
+    /** @brief A tag class representing a lower triangular matrix with unit diagonal*/
+    struct unit_lower_tag
+    {
+      static const char * name() { return "unit_lower"; }
+    }; //unit lower triangular matrix
+    /** @brief A tag class representing an upper triangular matrix with unit diagonal*/
+    struct unit_upper_tag
+    {
+      static const char * name() { return "unit_upper"; }
+    }; //unit upper triangular matrix
+
+    //preconditioner tags
+    class ilut_tag;
+
+    /** @brief A tag class representing the use of no preconditioner */
+    class no_precond
+    {
+      public:
+        template<typename VectorType>
+        void apply(VectorType &) const {}
+    };
+
+
+  } //namespace linalg
+
+  //
+  // More namespace comments to follow:
+  //
+
+  /** @brief Namespace providing routines for handling the different memory domains. */
+  namespace backend
+  {
+    /** @brief Provides implementations for handling memory buffers in CPU RAM. */
+    namespace cpu_ram
+    {
+      /** @brief Holds implementation details for handling memory buffers in CPU RAM. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+
+    /** @brief Provides implementations for handling CUDA memory buffers. */
+    namespace cuda
+    {
+      /** @brief Holds implementation details for handling CUDA memory buffers. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+
+    /** @brief Implementation details for the generic memory backend interface. */
+    namespace detail {}
+
+    /** @brief Provides implementations for handling OpenCL memory buffers. */
+    namespace opencl
+    {
+      /** @brief Holds implementation details for handling OpenCL memory buffers. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+  }
+
+
+  /** @brief Holds implementation details for functionality in the main viennacl-namespace. Not intended for direct use by library users. */
+  namespace detail
+  {
+    /** @brief Helper namespace for fast Fourier transforms. Not to be used directly by library users. */
+    namespace fft
+    {
+      /** @brief Helper namespace for fast-Fourier transformation. Deprecated. */
+      namespace FFT_DATA_ORDER {}
+    }
+  }
+
+
+  /** @brief Provides an OpenCL kernel generator. */
+  namespace device_specific
+  {
+    /** @brief Provides the implementation for tuning the kernels for a particular device. */
+    namespace autotune {}
+
+    /** @brief Contains implementation details of the kernel generator. */
+    namespace detail {}
+
+    /** @brief Namespace holding the various device-specific parameters for generating the best kernels. */
+    namespace profiles {}
+
+    /** @brief Contains various helper routines for kernel generation. */
+    namespace utils {}
+  }
+
+  /** @brief Provides basic input-output functionality. */
+  namespace io
+  {
+    /** @brief Implementation details for IO functionality. Usually not of interest for a library user. */
+    namespace detail {}
+
+    /** @brief Namespace holding the various XML tag definitions for the kernel parameter tuning facility. */
+    namespace tag {}
+
+    /** @brief Namespace holding the various XML strings for the kernel parameter tuning facility. */
+    namespace val {}
+  }
+
+  /** @brief Provides all linear algebra operations which are not covered by operator overloads. */
+  namespace linalg
+  {
+    /** @brief Holds all CUDA compute kernels used by ViennaCL. */
+    namespace cuda
+    {
+      /** @brief Helper functions for the CUDA linear algebra backend. */
+      namespace detail {}
+    }
+
+    /** @brief Namespace holding implementation details for linear algebra routines. Usually not of interest for a library user. */
+    namespace detail
+    {
+      /** @brief Implementation namespace for algebraic multigrid preconditioner. */
+      namespace amg {}
+
+      /** @brief Implementation namespace for sparse approximate inverse preconditioner. */
+      namespace spai {}
+    }
+
+    /** @brief Holds all compute kernels with conventional host-based execution (buffers in CPU RAM). */
+    namespace host_based
+    {
+      /** @brief Helper functions for the host-based linear algebra backend. */
+      namespace detail {}
+    }
+
+    /** @brief Namespace containing the OpenCL kernels. Deprecated, will be moved to viennacl::linalg::opencl in future releases. */
+    namespace kernels {}
+
+    /** @brief Holds all routines providing OpenCL linear algebra operations. */
+    namespace opencl
+    {
+      /** @brief Helper functions for OpenCL-accelerated linear algebra operations. */
+      namespace detail {}
+
+      /** @brief Contains the OpenCL kernel generation functions for a predefined set of functionality. */
+      namespace kernels
+      {
+        /** @brief Implementation details for the predefined OpenCL kernels. */
+        namespace detail {}
+      }
+    }
+  }
+
+  /** @brief OpenCL backend. Manages platforms, contexts, buffers, kernels, etc. */
+  namespace ocl {}
+
+  /** @brief Namespace containing many meta-functions. */
+  namespace result_of {}
+
+  /** @brief Namespace for various tools used within ViennaCL. */
+  namespace tools
+  {
+    /** @brief Contains implementation details for the tools. Usually not of interest for the library user. */
+    namespace detail {}
+  }
+
+  /** @brief Namespace providing traits-information as well as generic wrappers to common routines for vectors and matrices such as size() or clear() */
+  namespace traits {}
+
+  /** @brief Contains the scheduling functionality which allows for dynamic kernel generation as well as the fusion of multiple statements into a single kernel. */
+  namespace scheduler
+  {
+    /** @brief Implementation details for the scheduler */
+    namespace detail {}
+
+    /** @brief Helper metafunctions used for the scheduler */
+    namespace result_of {}
+  }
+
+} //namespace viennacl
+
+#endif
+
+/*@}*/
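As a quick illustration of the two storage tags declared in this header (a sketch, not part
of the committed code; the padded dimensions 8 x 16 are arbitrary):

    #include <cassert>
    #include "viennacl/forwards.h"

    int main()
    {
      // offset of entry (i=2, j=3) in a dense matrix with an internal (padded) layout of 8 x 16:
      std::size_t row_major_offset    = viennacl::row_major::mem_index(2, 3, 8, 16);     // 2*16 + 3 = 35
      std::size_t column_major_offset = viennacl::column_major::mem_index(2, 3, 8, 16);  // 2 + 3*8  = 26
      assert(row_major_offset == 35 && column_major_offset == 26);
      return 0;
    }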

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
new file mode 100644
index 0000000..084e6c8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
@@ -0,0 +1,343 @@
+#ifndef VIENNACL_HANKEL_MATRIX_HPP
+#define VIENNACL_HANKEL_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file hankel_matrix.hpp
+    @brief Implementation of the hankel_matrix class for efficient manipulation of Hankel matrices.  Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/toeplitz_matrix.hpp"
+#include "viennacl/fft.hpp"
+
+#include "viennacl/linalg/hankel_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief A Hankel matrix class
+  *
+  * @tparam NumericT   The underlying scalar type (either float or double)
+  * @tparam AlignmentV    The internal memory size is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8, or 16; higher values are usually a waste of memory.
+  */
+template<class NumericT, unsigned int AlignmentV>
+class hankel_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+
+  /**
+       * @brief The default constructor. Does not allocate any memory.
+       *
+       */
+  explicit hankel_matrix() {}
+
+  /**
+       * @brief         Creates the matrix with the given size
+       *
+       * @param rows      Number of rows of the matrix
+       * @param cols      Number of columns of the matrix
+       */
+  explicit hankel_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows, cols)
+  {
+    assert(rows == cols && bool("Hankel matrix must be square!"));
+    (void)cols;  // avoid 'unused parameter' warning in optimized builds
+  }
+
+  /** @brief Resizes the matrix.
+      *   Existing entries can be preserved
+      *
+      * @param sz         New size of matrix
+      * @param preserve   If true, existing values are preserved.
+      */
+  void resize(vcl_size_t sz, bool preserve = true)
+  {
+    elements_.resize(sz, preserve);
+  }
+
+  /** @brief Returns the OpenCL handle
+      *
+      *   @return OpenCL handle
+      */
+  handle_type const & handle() const { return elements_.handle(); }
+
+  /**
+       * @brief Returns the internal viennacl::toeplitz_matrix that stores the Hankel matrix elements
+       *
+       */
+  toeplitz_matrix<NumericT, AlignmentV> & elements() { return elements_; }
+  toeplitz_matrix<NumericT, AlignmentV> const & elements() const { return elements_; }
+
+  /**
+       * @brief Returns the number of rows of the matrix
+       */
+  vcl_size_t size1() const { return elements_.size1(); }
+
+  /**
+       * @brief Returns the number of columns of the matrix
+       */
+  vcl_size_t size2() const { return elements_.size2(); }
+
+  /** @brief Returns the internal size of the matrix representation.
+      *   Usually required for launching OpenCL kernels only
+      *
+      *   @return Internal size of matrix representation
+      */
+  vcl_size_t internal_size() const { return elements_.internal_size(); }
+
+  /**
+       * @brief Read-write access to an element of the matrix
+       *
+       * @param row_index  Row index of accessed element
+       * @param col_index  Column index of accessed element
+       * @return Proxy for matrix entry
+       */
+  entry_proxy<NumericT> operator()(unsigned int row_index, unsigned int col_index)
+  {
+    assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+    return elements_(size1() - row_index - 1, col_index);
+  }
+
+  /**
+       * @brief += operation for Hankel matrices
+       *
+       * @param that Matrix which will be added
+       * @return Result of addition
+       */
+  hankel_matrix<NumericT, AlignmentV>& operator +=(hankel_matrix<NumericT, AlignmentV>& that)
+  {
+    elements_ += that.elements();
+    return *this;
+  }
+
+private:
+  hankel_matrix(hankel_matrix const &) {}
+  hankel_matrix & operator=(hankel_matrix const & t);
+
+  toeplitz_matrix<NumericT, AlignmentV> elements_;
+};
+
+/** @brief Copies a Hankel matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param cpu_vec   A std::vector on the host.
+  * @param gpu_mat   A hankel_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(std::vector<NumericT> const & cpu_vec, hankel_matrix<NumericT, AlignmentV> & gpu_mat)
+{
+  assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && bool("Size mismatch"));
+
+  copy(cpu_vec, gpu_mat.elements());
+}
+
+/** @brief Copies a Hankel matrix from the OpenCL device (either GPU or multi-core CPU) to the std::vector
+  *
+  *
+  * @param gpu_mat   A hankel_matrix from ViennaCL
+  * @param cpu_vec   A std::vector on the host.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(hankel_matrix<NumericT, AlignmentV> const & gpu_mat, std::vector<NumericT> & cpu_vec)
+{
+  assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && bool("Size mismatch"));
+
+  copy(gpu_mat.elements(), cpu_vec);
+}
+
+/** @brief Copies a Hankel matrix from the OpenCL device (either GPU or multi-core CPU) to the matrix-like object
+  *
+  *
+  * @param han_src   A hankel_matrix from ViennaCL
+  * @param com_dst   A matrix-like object
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(hankel_matrix<NumericT, AlignmentV> const & han_src, MatrixT& com_dst)
+{
+  assert( (viennacl::traits::size1(com_dst) == han_src.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(com_dst) == han_src.size2()) && bool("Size mismatch") );
+
+  vcl_size_t size = han_src.size1();
+  std::vector<NumericT> tmp(size * 2 - 1);
+  copy(han_src, tmp);
+
+  for (vcl_size_t i = 0; i < size; i++)
+    for (vcl_size_t j = 0; j < size; j++)
+      com_dst(i, j) = tmp[i + j];
+}
+
+/** @brief Copies a matrix-like object from the host to a Hankel matrix on the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param com_src   A matrix-like object on the host
+  * @param han_dst   A hankel_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(MatrixT const & com_src, hankel_matrix<NumericT, AlignmentV>& han_dst)
+{
+  assert( (han_dst.size1() == 0 || viennacl::traits::size1(com_src) == han_dst.size1()) && bool("Size mismatch") );
+  assert( (han_dst.size2() == 0 || viennacl::traits::size2(com_src) == han_dst.size2()) && bool("Size mismatch") );
+  assert( viennacl::traits::size2(com_src) == viennacl::traits::size1(com_src) && bool("Logic error: non-square Hankel matrix!") );
+
+  vcl_size_t size = viennacl::traits::size1(com_src);
+
+  std::vector<NumericT> tmp(2*size - 1);
+
+  for (vcl_size_t i = 0; i < size; i++)
+    tmp[i] = com_src(0, i);
+
+  for (vcl_size_t i = 1; i < size; i++)
+    tmp[size + i - 1] = com_src(size - 1, i);
+
+  viennacl::copy(tmp, han_dst);
+}
+
+/*template<typename NumericT, unsigned int AlignmentV, unsigned int VECTOR_AlignmentV>
+  void prod_impl(hankel_matrix<NumericT, AlignmentV>& mat,
+                 vector<NumericT, VECTOR_AlignmentV>& vec,
+                 vector<NumericT, VECTOR_AlignmentV>& result)
+  {
+      prod_impl(mat.elements(), vec, result);
+      fft::reverse(result);
+  }*/
+
+template<class NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream & s, hankel_matrix<NumericT, AlignmentV>& gpu_matrix)
+{
+  vcl_size_t size = gpu_matrix.size1();
+  std::vector<NumericT> tmp(2*size - 1);
+  copy(gpu_matrix, tmp);
+  s << "[" << size << "," << size << "](";
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    s << "(";
+    for (vcl_size_t j = 0; j < size; j++)
+    {
+      s << tmp[i + j];
+      //s << (int)i - (int)j;
+      if (j < (size - 1)) s << ",";
+    }
+    s << ")";
+  }
+  s << ")";
+  return s;
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs += temp;
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs -= temp;
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+#endif // VIENNACL_HANKEL_MATRIX_HPP
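A minimal usage sketch for the hankel_matrix interface above (the sizes and values are illustrative, a default ViennaCL/OpenCL context is assumed, and the helper name hankel_usage_sketch is made up for this example; it is not part of the patch):

#include <cstddef>
#include <vector>
#include "viennacl/hankel_matrix.hpp"

void hankel_usage_sketch()
{
  const std::size_t n = 5;
  viennacl::hankel_matrix<float, 1> H(n, n);     // Hankel matrices must be square

  // A Hankel matrix is fully determined by its 2n-1 anti-diagonal values,
  // with H(i, j) == v[i + j] (this is the layout used by the copy() routines above).
  std::vector<float> v(2 * n - 1);
  for (std::size_t k = 0; k < v.size(); ++k)
    v[k] = float(k + 1);
  viennacl::copy(v, H);                          // host -> device

  std::vector<float> x_host(n, 1.0f);
  viennacl::vector<float> x(n), y(n);
  viennacl::copy(x_host, x);
  viennacl::linalg::prod_impl(H, x, y);          // y = H * x (hankel_matrix_operations.hpp)
}

The op_executor specializations at the end of the file consume the corresponding matrix-vector product expressions, so assignment from such an expression ends up in the same prod_impl() call.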

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
new file mode 100644
index 0000000..e93ede5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
@@ -0,0 +1,442 @@
+#ifndef VIENNACL_HYB_MATRIX_HPP_
+#define VIENNACL_HYB_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/hyb_matrix.hpp
+    @brief Implementation of the hyb_matrix class
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief Sparse matrix class using a hybrid format composed of the ELL and CSR formats for storing the nonzeros. */
+template<typename NumericT, unsigned int AlignmentV  /* see forwards.h for default argument */>
+class hyb_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+
+  hyb_matrix() : csr_threshold_(NumericT(0.8)), rows_(0), cols_(0) {}
+
+  hyb_matrix(viennacl::context ctx) : csr_threshold_(NumericT(0.8)), rows_(0), cols_(0)
+  {
+    ell_coords_.switch_active_handle_id(ctx.memory_type());
+    ell_elements_.switch_active_handle_id(ctx.memory_type());
+
+    csr_rows_.switch_active_handle_id(ctx.memory_type());
+    csr_cols_.switch_active_handle_id(ctx.memory_type());
+    csr_elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      ell_coords_.opencl_handle().context(ctx.opencl_context());
+      ell_elements_.opencl_handle().context(ctx.opencl_context());
+
+      csr_rows_.opencl_handle().context(ctx.opencl_context());
+      csr_cols_.opencl_handle().context(ctx.opencl_context());
+      csr_elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+  void clear()
+  {
+    // ELL part:
+    ellnnz_ = 0;
+
+    viennacl::backend::typesafe_host_array<unsigned int> host_coords_buffer(ell_coords_, internal_size1());
+    std::vector<NumericT> host_elements(internal_size1());
+
+    viennacl::backend::memory_create(ell_coords_,   host_coords_buffer.element_size() * internal_size1(), viennacl::traits::context(ell_coords_),   host_coords_buffer.get());
+    viennacl::backend::memory_create(ell_elements_, sizeof(NumericT) * internal_size1(),                  viennacl::traits::context(ell_elements_), &(host_elements[0]));
+
+    // CSR part:
+    csrnnz_ = 0;
+
+    viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(csr_rows_, rows_ + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(csr_cols_, 1);
+    host_elements.resize(1);
+
+    viennacl::backend::memory_create(csr_rows_,     host_row_buffer.element_size() * (rows_ + 1), viennacl::traits::context(csr_rows_),     host_row_buffer.get());
+    viennacl::backend::memory_create(csr_cols_,     host_col_buffer.element_size() * 1,           viennacl::traits::context(csr_cols_),     host_col_buffer.get());
+    viennacl::backend::memory_create(csr_elements_, sizeof(NumericT) * 1,                         viennacl::traits::context(csr_elements_), &(host_elements[0]));
+  }
+
+  NumericT  csr_threshold()  const { return csr_threshold_; }
+  void csr_threshold(NumericT thr) { csr_threshold_ = thr; }
+
+  vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, AlignmentV); }
+  vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, AlignmentV); }
+
+  vcl_size_t size1() const { return rows_; }
+  vcl_size_t size2() const { return cols_; }
+
+  vcl_size_t internal_ellnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(ellnnz_, AlignmentV); }
+  vcl_size_t ell_nnz() const { return ellnnz_; }
+  vcl_size_t csr_nnz() const { return csrnnz_; }
+
+  const handle_type & handle() const { return ell_elements_; }
+  const handle_type & handle2() const { return ell_coords_; }
+  const handle_type & handle3() const { return csr_rows_; }
+  const handle_type & handle4() const { return csr_cols_; }
+  const handle_type & handle5() const { return csr_elements_; }
+
+public:
+#if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+  template<typename CPUMatrixT>
+  friend void copy(const CPUMatrixT & cpu_matrix, hyb_matrix & gpu_matrix );
+#else
+  template<typename CPUMatrixT, typename T, unsigned int ALIGN>
+  friend void copy(const CPUMatrixT & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
+#endif
+
+private:
+  NumericT  csr_threshold_;
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t ellnnz_;
+  vcl_size_t csrnnz_;
+
+  handle_type ell_coords_; // ell coords
+  handle_type ell_elements_; // ell elements
+
+  handle_type csr_rows_;
+  handle_type csr_cols_;
+  handle_type csr_elements_;
+};
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT& cpu_matrix, hyb_matrix<NumericT, AlignmentV>& gpu_matrix )
+{
+  assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if (cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+  {
+    //determine max capacity for row
+    vcl_size_t max_entries_per_row = 0;
+    std::vector<vcl_size_t> hist_entries(cpu_matrix.size2() + 1, 0);
+
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+    {
+      vcl_size_t num_entries = 0;
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      {
+        ++num_entries;
+      }
+
+      hist_entries[num_entries] += 1;
+      max_entries_per_row = std::max(max_entries_per_row, num_entries);
+    }
+
+    vcl_size_t sum = 0;
+    for (vcl_size_t ind = 0; ind <= max_entries_per_row; ind++)
+    {
+      sum += hist_entries[ind];
+
+      if (NumericT(sum) >= NumericT(gpu_matrix.csr_threshold()) * NumericT(cpu_matrix.size1()))
+      {
+        max_entries_per_row = ind;
+        break;
+      }
+    }
+
+    //setup GPU matrix
+    gpu_matrix.ellnnz_ = max_entries_per_row;
+    gpu_matrix.rows_ = cpu_matrix.size1();
+    gpu_matrix.cols_ = cpu_matrix.size2();
+
+    vcl_size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
+
+    viennacl::backend::typesafe_host_array<unsigned int>  ell_coords(gpu_matrix.ell_coords_, nnz);
+    viennacl::backend::typesafe_host_array<unsigned int>  csr_rows(gpu_matrix.csr_rows_, cpu_matrix.size1() + 1);
+    std::vector<unsigned int> csr_cols;
+
+    std::vector<NumericT> ell_elements(nnz);
+    std::vector<NumericT> csr_elements;
+
+    vcl_size_t csr_index = 0;
+
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+    {
+      vcl_size_t data_index = 0;
+
+      csr_rows.set(row_it.index1(), csr_index);
+
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      {
+        if (data_index < max_entries_per_row)
+        {
+          ell_coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+          ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+        }
+        else
+        {
+          csr_cols.push_back(static_cast<unsigned int>(col_it.index2()));
+          csr_elements.push_back(*col_it);
+
+          csr_index++;
+        }
+
+        data_index++;
+      }
+
+    }
+
+    if (csr_cols.empty())
+    {
+      csr_cols.push_back(0);
+      csr_elements.push_back(0);
+    }
+
+    csr_rows.set(csr_rows.size() - 1, csr_index);
+
+    gpu_matrix.csrnnz_ = csr_cols.size();
+
+    viennacl::backend::typesafe_host_array<unsigned int> csr_cols_for_gpu(gpu_matrix.csr_cols_, csr_cols.size());
+    for (vcl_size_t i=0; i<csr_cols.size(); ++i)
+      csr_cols_for_gpu.set(i, csr_cols[i]);
+
+    viennacl::backend::memory_create(gpu_matrix.ell_coords_,   ell_coords.raw_size(),                    traits::context(gpu_matrix.ell_coords_), ell_coords.get());
+    viennacl::backend::memory_create(gpu_matrix.ell_elements_, sizeof(NumericT) * ell_elements.size(), traits::context(gpu_matrix.ell_elements_), &(ell_elements[0]));
+
+    viennacl::backend::memory_create(gpu_matrix.csr_rows_,     csr_rows.raw_size(),                      traits::context(gpu_matrix.csr_rows_), csr_rows.get());
+    viennacl::backend::memory_create(gpu_matrix.csr_cols_,     csr_cols_for_gpu.raw_size(),              traits::context(gpu_matrix.csr_cols_), csr_cols_for_gpu.get());
+    viennacl::backend::memory_create(gpu_matrix.csr_elements_, sizeof(NumericT) * csr_elements.size(), traits::context(gpu_matrix.csr_elements_), &(csr_elements[0]));
+  }
+}
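To spell out the threshold step in the copy() above: the ELL width becomes the smallest per-row entry count k such that at least csr_threshold() * size1() rows have no more than k entries; everything a row holds beyond that width spills into the CSR arrays. A standalone sketch of just this selection (pick_ell_width is a hypothetical helper, not part of ViennaCL):

#include <cstddef>
#include <vector>

// Given hist_entries[k] = number of rows that have exactly k nonzeros,
// return the ELL width chosen by the histogram/threshold loop above.
std::size_t pick_ell_width(std::vector<std::size_t> const & hist_entries,
                           std::size_t max_entries_per_row,
                           std::size_t num_rows,
                           double csr_threshold)        // hyb_matrix default: 0.8
{
  std::size_t sum = 0;
  for (std::size_t k = 0; k <= max_entries_per_row; ++k)
  {
    sum += hist_entries[k];
    if (double(sum) >= csr_threshold * double(num_rows))
      return k;   // e.g. 80% of all rows fit into k ELL slots per row
  }
  return max_entries_per_row;
}

Rows with more entries than this width have their surplus entries appended to the CSR arrays, which is what the data_index < max_entries_per_row branch above implements.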
+
+
+/** @brief Copies a sparse matrix from the host to the compute device. The host type is the std::vector< std::map < > > format.
+  *
+  * @param cpu_matrix   A sparse matrix on the host composed of an STL vector and an STL map.
+  * @param gpu_matrix   The sparse hyb_matrix from ViennaCL
+  */
+template<typename IndexT, typename NumericT, unsigned int AlignmentV>
+void copy(std::vector< std::map<IndexT, NumericT> > const & cpu_matrix,
+          hyb_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+  vcl_size_t max_col = 0;
+  for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+  {
+    if (cpu_matrix[i].size() > 0)
+      max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+  }
+
+  viennacl::copy(tools::const_sparse_matrix_adapter<NumericT, IndexT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
+
+
+
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const hyb_matrix<NumericT, AlignmentV>& gpu_matrix, CPUMatrixT& cpu_matrix)
+{
+  assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  if (gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+  {
+    std::vector<NumericT> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+    viennacl::backend::typesafe_host_array<unsigned int> ell_coords(gpu_matrix.handle2(), gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+
+    std::vector<NumericT> csr_elements(gpu_matrix.csr_nnz());
+    viennacl::backend::typesafe_host_array<unsigned int> csr_rows(gpu_matrix.handle3(), gpu_matrix.size1() + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> csr_cols(gpu_matrix.handle4(), gpu_matrix.csr_nnz());
+
+    viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT) * ell_elements.size(), &(ell_elements[0]));
+    viennacl::backend::memory_read(gpu_matrix.handle2(), 0, ell_coords.raw_size(), ell_coords.get());
+    viennacl::backend::memory_read(gpu_matrix.handle3(), 0, csr_rows.raw_size(),   csr_rows.get());
+    viennacl::backend::memory_read(gpu_matrix.handle4(), 0, csr_cols.raw_size(),   csr_cols.get());
+    viennacl::backend::memory_read(gpu_matrix.handle5(), 0, sizeof(NumericT) * csr_elements.size(), &(csr_elements[0]));
+
+
+    for (vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+    {
+      for (vcl_size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
+      {
+        vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+        NumericT val = ell_elements[offset];
+        if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+          continue;
+
+        if (ell_coords[offset] >= gpu_matrix.size2())
+        {
+          std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << ell_coords[offset] << " " << gpu_matrix.size2() << std::endl;
+          return;
+        }
+
+        cpu_matrix(row, ell_coords[offset]) = val;
+      }
+
+      for (vcl_size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
+      {
+        NumericT val = csr_elements[ind];
+        if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+          continue;
+
+        if (csr_cols[ind] >= gpu_matrix.size2())
+        {
+          std::cerr << "ViennaCL encountered invalid data " << std::endl;
+          return;
+        }
+
+        cpu_matrix(row, csr_cols[ind]) = val;
+      }
+    }
+  }
+}
+
+/** @brief Copies a sparse matrix from the compute device to the host. The host type is the std::vector< std::map < > > format.
+  *
+  * @param gpu_matrix   The sparse hyb_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host composed of an STL vector and an STL map.
+  */
+template<typename NumericT, unsigned int AlignmentV, typename IndexT>
+void copy(const hyb_matrix<NumericT, AlignmentV> & gpu_matrix,
+          std::vector< std::map<IndexT, NumericT> > & cpu_matrix)
+{
+  if (cpu_matrix.size() == 0)
+    cpu_matrix.resize(gpu_matrix.size1());
+
+  assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+  tools::sparse_matrix_adapter<NumericT, IndexT> temp(cpu_matrix, cpu_matrix.size(), gpu_matrix.size2());
+  viennacl::copy(gpu_matrix, temp);
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x += A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x -= A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
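A minimal usage sketch for hyb_matrix (the 3x3 data is illustrative, a suitable compute context is assumed, and hyb_usage_sketch is a made-up name):

#include <map>
#include <vector>
#include "viennacl/hyb_matrix.hpp"
#include "viennacl/linalg/prod.hpp"

void hyb_usage_sketch()
{
  // Sparse 3x3 matrix on the host in std::vector< std::map<> > form.
  std::vector< std::map<unsigned int, double> > A_host(3);
  A_host[0][0] = 4.0;  A_host[0][2] = 1.0;
  A_host[1][1] = 3.0;
  A_host[2][0] = 1.0;  A_host[2][2] = 5.0;

  viennacl::hyb_matrix<double> A;     // default alignment, csr_threshold = 0.8
  viennacl::copy(A_host, A);          // splits the entries into ELL and CSR parts

  std::vector<double> x_host(3, 1.0);
  viennacl::vector<double> x(3), y(3);
  viennacl::copy(x_host, x);

  // The assignment dispatches to the op_executor specialization defined above.
  y = viennacl::linalg::prod(A, x);   // y = A * x
}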

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp b/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
new file mode 100644
index 0000000..e8444ee
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
@@ -0,0 +1,440 @@
+#ifndef VIENNACL_IO_MATRIX_MARKET_HPP
+#define VIENNACL_IO_MATRIX_MARKET_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file matrix_market.hpp
+    @brief A reader and writer for the matrix market format are implemented here
+*/
+
+#include <algorithm>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <cctype>
+#include "viennacl/tools/adapter.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/fill.hpp"
+
+namespace viennacl
+{
+namespace io
+{
+//helper
+namespace detail
+{
+  inline void trim(char * buffer, long max_size)
+  {
+    //trim at beginning of string
+    long start = 0;
+    for (long i=0; i<max_size; ++i)
+    {
+      if (buffer[i] == ' ')
+        ++start;
+      else
+        break;
+    }
+
+    //trim at end of string
+    long stop = start;
+    for (long i=stop; i<max_size; ++i)
+    {
+      if (buffer[i] == 0)   //end of string
+        break;
+
+      if (buffer[i] != ' ')
+        stop = i;
+    }
+
+    for (long i=0; i<=stop - start; ++i)
+    {
+      buffer[i] = buffer[start + i];
+    }
+
+    if (buffer[0] != ' ')
+      buffer[stop - start + 1] = 0; //terminate string
+    else
+      buffer[0] = 0;
+  }
+
+  inline std::string tolower(std::string & s)
+  {
+    std::transform(s.begin(), s.end(), s.begin(), static_cast < int(*)(int) > (std::tolower));
+    return s;
+  }
+
+
+
+} //namespace
+
+///////// reader ////////////
+
+/** @brief Reads a sparse matrix from a file (MatrixMarket format); the dense 'array' storage type is not supported by this reader
+*
+* Note: If the matrix in the MatrixMarket file is complex, only the real-valued part is loaded!
+*
+* @param mat The matrix that is to be read
+* @param file Filename from which the matrix should be read
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, operator() writes array entries, resize() allows resizing the matrix.
+* @return Returns nonzero if file is read correctly
+*/
+template<typename MatrixT>
+long read_matrix_market_file_impl(MatrixT & mat,
+                                  const char * file,
+                                  long index_base)
+{
+  typedef typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<MatrixT>::type>::type    ScalarT;
+
+  //std::cout << "Reading matrix market file" << std::endl;
+  char buffer[1025];
+  std::ifstream reader(file);
+  std::string token;
+  long linenum = 0;
+  bool symmetric = false;
+  bool dense_format = false;
+  bool is_header = true;
+  bool pattern_matrix = false;
+  //bool is_complex = false;
+  long cur_row = 0;
+  long cur_col = 0;
+  long valid_entries = 0;
+  long nnz = 0;
+
+
+  if (!reader){
+    std::cerr << "ViennaCL: Matrix Market Reader: Cannot open file " << file << std::endl;
+    return 0;   // 0 signals failure, consistent with the documented return contract
+  }
+
+  while (reader.good())
+  {
+    // get a non-empty line
+    do
+    {
+      reader.getline(buffer, 1024);
+      ++linenum;
+      detail::trim(buffer, 1024);
+    }
+    while (reader.good() && buffer[0] == 0);
+
+    if (buffer[0] == '%')
+    {
+      if (buffer[1] == '%')
+      {
+        //parse header:
+        std::stringstream line(std::string(buffer + 2));
+        line >> token;
+        if (detail::tolower(token) != "matrixmarket")
+        {
+          std::cerr << "Error in file " << file << " at line " << linenum << ": Expected 'MatrixMarket', got '" << token << "'" << std::endl;
+          return 0;
+        }
+
+        line >> token;
+        if (detail::tolower(token) != "matrix")
+        {
+          std::cerr << "Error in file " << file << " at line " << linenum << ": Expected 'matrix', got '" << token << "'" << std::endl;
+          return 0;
+        }
+
+        line >> token;
+        if (detail::tolower(token) != "coordinate")
+        {
+          if (detail::tolower(token) == "array")
+          {
+            dense_format = true;
+            std::cerr << "Error in file " << file << " at line " << linenum << ": 'array' type is not supported yet!" << std::endl;
+            return 0;
+          }
+          else
+          {
+            std::cerr << "Error in file " << file << " at line " << linenum << ": Expected 'array' or 'coordinate', got '" << token << "'" << std::endl;
+            return 0;
+          }
+        }
+
+        line >> token;
+        if (detail::tolower(token) == "pattern")
+        {
+          pattern_matrix = true;
+        }
+        else if (detail::tolower(token) == "complex")
+        {
+          //is_complex = true;
+        }
+        else if (detail::tolower(token) != "real")
+        {
+          std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only real valued floating point arithmetic or pattern type matrices." << std::endl;
+          return 0;
+        }
+
+        line >> token;
+        if (detail::tolower(token) == "general"){ }
+        else if (detail::tolower(token) == "symmetric"){ symmetric = true; }
+        else
+        {
+          std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only general or symmetric matrices." << std::endl;
+          return 0;
+        }
+
+      }
+    }
+    else
+    {
+      std::stringstream line(std::stringstream::in | std::stringstream::out);
+      line << std::string(buffer);
+
+      if (is_header)
+      {
+        //read header line
+        vcl_size_t rows;
+        vcl_size_t cols;
+
+        if (line.good())
+          line >> rows;
+        else
+        {
+          std::cerr << "Error in file " << file << ": Could not get matrix dimensions (rows) in line " << linenum << std::endl;
+          return 0;
+        }
+
+        if (line.good())
+          line >> cols;
+        else
+        {
+          std::cerr << "Error in file " << file << ": Could not get matrix dimensions (columns) in line " << linenum << std::endl;
+          return 0;
+        }
+        if (!dense_format)
+        {
+          if (line.good())
+            line >> nnz;
+          else
+          {
+            std::cerr << "Error in file " << file << ": Could not get the number of nonzeros in line " << linenum << std::endl;
+            return 0;
+          }
+        }
+
+        if (rows > 0 && cols > 0)
+          viennacl::traits::resize(mat, rows, cols);
+
+        is_header = false;
+      }
+      else
+      {
+        //read data
+        if (dense_format)
+        {
+          ScalarT value;
+          line >> value;
+          viennacl::traits::fill(mat, static_cast<vcl_size_t>(cur_row), static_cast<vcl_size_t>(cur_col), value);
+
+          if (++cur_row == static_cast<long>(viennacl::traits::size1(mat)))
+          {
+            //next column
+            ++cur_col;
+            cur_row = 0;
+          }
+        }
+        else //sparse format
+        {
+          long row;
+          long col;
+          ScalarT value = ScalarT(1);
+
+          //parse data:
+          if (line.good())
+            line >> row;
+          else
+          {
+            std::cerr << "Error in file " << file << ": Parse error for matrix row entry in line " << linenum << std::endl;
+            return 0;
+          }
+
+          if (line.good())
+            line >> col;
+          else
+          {
+            std::cerr << "Error in file " << file << ": Parse error for matrix col entry in line " << linenum << std::endl;
+            return 0;
+          }
+
+          //take index_base base into account:
+          row -= index_base;
+          col -= index_base;
+
+          if (!pattern_matrix) // value for pattern matrix is implicitly 1, so we only need to read data for 'normal' matrices
+          {
+            if (line.good())
+            {
+                line >> value;
+            }
+            else
+            {
+              std::cerr << "Error in file " << file << ": Parse error for matrix entry in line " << linenum << std::endl;
+              return 0;
+            }
+          }
+
+          if (row >= static_cast<long>(viennacl::traits::size1(mat)) || row < 0)
+          {
+            std::cerr << "Error in file " << file << " at line " << linenum << ": Row index out of bounds: " << row << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
+            return 0;
+          }
+
+          if (col >= static_cast<long>(viennacl::traits::size2(mat)) || col < 0)
+          {
+            std::cerr << "Error in file " << file << " at line " << linenum << ": Column index out of bounds: " << col << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
+            return 0;
+          }
+
+          viennacl::traits::fill(mat, static_cast<vcl_size_t>(row), static_cast<vcl_size_t>(col), value); //basically equivalent to mat(row, col) = value;
+          if (symmetric)
+            viennacl::traits::fill(mat, static_cast<vcl_size_t>(col), static_cast<vcl_size_t>(row), value); //basically equivalent to mat(col, row) = value;
+
+          if (++valid_entries == nnz)
+            break;
+
+        } //else dense_format
+      }
+    }
+  }
+
+  //std::cout << linenum << " lines read." << std::endl;
+  reader.close();
+  return linenum;
+}
+
+
+/** @brief Reads a sparse matrix from a file (MatrixMarket format)
+*
+* @param mat The matrix that is to be read (ublas-types and std::vector< std::map <unsigned int, ScalarT> > are supported)
+* @param file The filename
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, operator() writes array entries, resize() allows resizing the matrix.
+* @return Returns nonzero if file is read correctly
+*/
+template<typename MatrixT>
+long read_matrix_market_file(MatrixT & mat,
+                             const char * file,
+                             long index_base = 1)
+{
+  return read_matrix_market_file_impl(mat, file, index_base);
+}
+
+template<typename MatrixT>
+long read_matrix_market_file(MatrixT & mat,
+                             const std::string & file,
+                             long index_base = 1)
+{
+  return read_matrix_market_file_impl(mat, file.c_str(), index_base);
+}
+
+template<typename ScalarT>
+long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > & mat,
+                             const char * file,
+                             long index_base = 1)
+{
+  viennacl::tools::sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+  return read_matrix_market_file_impl(adapted_matrix, file, index_base);
+}
+
+template<typename ScalarT>
+long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > & mat,
+                             const std::string & file,
+                             long index_base = 1)
+{
+  viennacl::tools::sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+  return read_matrix_market_file_impl(adapted_matrix, file.c_str(), index_base);
+}
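A typical way to use the reader overloads above: parse into a host-side std::vector< std::map<> > structure and then transfer to a device matrix (the file name handling and the choice of compressed_matrix as target are illustrative; load_mtx_sketch is a made-up name):

#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/io/matrix_market.hpp"

bool load_mtx_sketch(const char * filename)
{
  // 1. Read the MatrixMarket file into a host-side sparse structure.
  std::vector< std::map<unsigned int, double> > A_host;
  if (viennacl::io::read_matrix_market_file(A_host, filename) == 0)
    return false;                              // the reader returns 0 on parse errors

  // 2. Transfer to the compute device.
  viennacl::compressed_matrix<double> A;
  viennacl::copy(A_host, A);
  return true;
}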
+
+
+////////// writer /////////////
+template<typename MatrixT>
+void write_matrix_market_file_impl(MatrixT const & mat, const char * file, long index_base)
+{
+  std::ofstream writer(file);
+
+  long num_entries = 0;
+  for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+       row_it != mat.end1();
+       ++row_it)
+    for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+         col_it != row_it.end();
+         ++col_it)
+      ++num_entries;
+
+  writer << "%%MatrixMarket matrix coordinate real general" << std::endl;
+  writer << mat.size1() << " " << mat.size2() << " " << num_entries << std::endl;
+
+  for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+       row_it != mat.end1();
+       ++row_it)
+    for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+         col_it != row_it.end();
+         ++col_it)
+      writer << col_it.index1() + index_base << " " << col_it.index2() + index_base << " " << *col_it << std::endl;
+
+  writer.close();
+}
+
+template<typename ScalarT>
+void write_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > const & mat,
+                              const char * file,
+                              long index_base = 1)
+{
+  viennacl::tools::const_sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+  return write_matrix_market_file_impl(adapted_matrix, file, index_base);
+}
+
+template<typename ScalarT>
+void write_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > const & mat,
+                              const std::string & file,
+                              long index_base = 1)
+{
+  viennacl::tools::const_sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+  return write_matrix_market_file_impl(adapted_matrix, file.c_str(), index_base);
+}
+
+/** @brief Writes a sparse matrix to a file (MatrixMarket format)
+*
+* @param mat The matrix that is to be written (ublas-types and std::vector< std::map <unsigned int, ScalarT> > are supported)
+* @param file The filename
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, const_iterator1 and const_iterator2 allow iteration over the stored entries.
+*/
+template<typename MatrixT>
+void write_matrix_market_file(MatrixT const & mat,
+                              const std::string & file,
+                              long index_base = 1)
+{
+  write_matrix_market_file_impl(mat, file.c_str(), index_base);
+}
+
+
+} //namespace io
+} //namespace viennacl
+
+#endif


[12/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
new file mode 100644
index 0000000..64c12b0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
@@ -0,0 +1,2645 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/kernels/bisect.hpp
+    @brief OpenCL kernels for the bisection algorithm for eigenvalues
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+  template <typename StringType>
+  void generate_bisect_kernel_config(StringType & source)
+  {
+    /* Global configuration parameter */
+    source.append("     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK                256\n");
+    source.append("     #define  VIENNACL_BISECT_MAX_SMALL_MATRIX                 256\n");
+    source.append("     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256\n");
+    source.append("     #define  VIENNACL_BISECT_MIN_ABS_INTERVAL                 5.0e-37\n");
+
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute the next lower power of two of n
+  // n    number for which the next lower power of two is sought
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline int  \n");
+  source.append("     floorPow2(int n)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0);  \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+      // early out if already power of two
+  source.append("         if (0 == (n & (n-1)))  \n");
+  source.append("         {  \n");
+  source.append("             return n;  \n");
+  source.append("         }  \n");
+
+  source.append("         int exp;  \n");
+  source.append("         frexp(( "); source.append(numeric_string); source.append(" )n, &exp);  \n");
+  source.append("         return (1 << (exp - 1));  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute the next higher power of two of n
+  // n  number for which the next higher power of two is sought
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline int  \n");
+  source.append("     ceilPow2(int n)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+      // early out if already power of two
+  source.append("         if (0 == (n & (n-1)))  \n");
+  source.append("         {  \n");
+  source.append("             return n;  \n");
+  source.append("         }  \n");
+
+  source.append("         int exp;  \n");
+  source.append("         frexp(( "); source.append(numeric_string); source.append(" )n, &exp);  \n");
+  source.append("         return (1 << exp);  \n");
+  source.append("     }  \n");
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible
+  //
+  // left     left  / lower limit of interval
+  // right    right / upper limit of interval
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline "); source.append(numeric_string); source.append(" \n");
+  source.append("     computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n");
+  source.append("       const "); source.append(numeric_string); source.append("  right)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+  source.append("          "); source.append(numeric_string); source.append("  mid;  \n");
+
+  source.append("         if (sign(left) == sign(right))  \n");
+  source.append("         {  \n");
+  source.append("             mid = left + (right - left) * 0.5f;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+  source.append("             mid = (left + right) * 0.5f;  \n");
+  source.append("         }  \n");
+
+  source.append("         return mid;  \n");
+  source.append("     }  \n");
+
+  }
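For readability, host-side C++ equivalents of the three device helpers generated above (floorPow2, ceilPow2, computeMidpoint). This is only an annotation of what the generated OpenCL code computes, not code used by ViennaCL:

#include <cmath>

// Next lower power of two of n (returns n if it already is a power of two).
inline int floor_pow2(int n)
{
  if ((n & (n - 1)) == 0) return n;
  int exp;
  std::frexp(static_cast<double>(n), &exp);    // n = m * 2^exp with m in [0.5, 1)
  return 1 << (exp - 1);
}

// Next higher power of two of n (returns n if it already is a power of two).
inline int ceil_pow2(int n)
{
  if ((n & (n - 1)) == 0) return n;
  int exp;
  std::frexp(static_cast<double>(n), &exp);
  return 1 << exp;
}

// Midpoint of [left, right]; the sign test avoids overflow of left + right
// when both limits are large and of equal sign.
inline double compute_midpoint(double left, double right)
{
  if ((left >= 0) == (right >= 0))
    return left + (right - left) * 0.5;
  return (left + right) * 0.5;
}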
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Check if interval converged and store appropriately
+  //
+  // addr           address where to store the information of the interval
+  // s_left         shared memory storage for left interval limits
+  // s_right        shared memory storage for right interval limits
+  // s_left_count   shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count  shared memory storage for number of eigenvalues less than right interval limits
+  // left           lower limit of interval
+  // right          upper limit of interval
+  // left_count     eigenvalues less than \a left
+  // right_count    eigenvalues less than \a right
+  // precision      desired precision for eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
+  {
+  source.append("     \n");
+  source.append("     void  \n");
+  source.append("     storeInterval(unsigned int addr,  \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_left,   \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_right,  \n");
+  source.append("                   __local unsigned int * s_left_count,  \n");
+  source.append("                   __local unsigned int * s_right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append(" left,   \n");
+  source.append("                    "); source.append(numeric_string); source.append(" right,  \n");
+  source.append("                   unsigned int left_count, \n");
+  source.append("                   unsigned int right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append("  precision)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         s_left_count[addr] = left_count;  \n");
+  source.append("         s_right_count[addr] = right_count;  \n");
+
+      // check if interval converged
+  source.append("          "); source.append(numeric_string); source.append(" t0 = fabs(right - left);  \n");
+  source.append("          "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision;  \n");
+
+  source.append("         if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
+  source.append("         {  \n");
+          // compute mid point
+  source.append("              "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right);  \n");
+
+          // mark as converged
+  source.append("             s_left[addr] = lambda;  \n");
+  source.append("             s_right[addr] = lambda;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // store current limits
+  source.append("             s_left[addr] = left;  \n");
+  source.append("             s_right[addr] = right;  \n");
+  source.append("         }  \n");
+
+  source.append("     }  \n");
+
+  }
+
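For reference, here is a minimal host-side sketch of the convergence test the generated storeInterval() applies, assuming double precision and a placeholder value for VIENNACL_BISECT_MIN_ABS_INTERVAL; the midpoint computation is a simplified stand-in for the kernel's computeMidpoint(). It is an illustration only, not code from this commit.

    #include <algorithm>
    #include <cmath>

    // Placeholder for VIENNACL_BISECT_MIN_ABS_INTERVAL (assumed value, illustration only).
    static const double MIN_ABS_INTERVAL = 1e-37;

    // Returns true if [left, right] has converged. In that case both limits are
    // collapsed onto the midpoint, mirroring the generated storeInterval().
    bool store_interval(double left, double right, double precision,
                        double &out_left, double &out_right)
    {
      double t0 = std::fabs(right - left);
      double t1 = std::max(std::fabs(left), std::fabs(right)) * precision;

      if (t0 <= std::max(MIN_ABS_INTERVAL, t1))
      {
        double lambda = 0.5 * (left + right);   // simplified midpoint
        out_left  = lambda;                     // mark as converged
        out_right = lambda;
        return true;
      }

      out_left  = left;                         // keep current limits
      out_right = right;
      return false;
    }

The storeIntervalShort() variant generated below differs only in the type used for the eigenvalue counts (unsigned short instead of unsigned int).
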
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("     \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalShort(unsigned int addr,  \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_left,   \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_right,  \n");
+  source.append("                   __local unsigned short * s_left_count,  \n");
+  source.append("                   __local unsigned short * s_right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append(" left,   \n");
+  source.append("                    "); source.append(numeric_string); source.append(" right,  \n");
+  source.append("                   unsigned int left_count, \n");
+  source.append("                   unsigned int right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append("  precision)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         s_left_count[addr] = left_count;  \n");
+  source.append("         s_right_count[addr] = right_count;  \n");
+
+      // check if interval converged
+  source.append("          "); source.append(numeric_string); source.append(" t0 = fabs(right - left);  \n");
+  source.append("          "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision;  \n");
+
+  source.append("         if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
+  source.append("         {  \n");
+          // compute mid point
+  source.append("              "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right);  \n");
+
+          // mark as converged
+  source.append("             s_left[addr] = lambda;  \n");
+  source.append("             s_right[addr] = lambda;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // store current limits
+  source.append("             s_left[addr] = left;  \n");
+  source.append("             s_right[addr] = right;  \n");
+  source.append("         }  \n");
+
+  source.append("     }  \n");
+
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute number of eigenvalues that are smaller than x given a symmetric,
+  // real, and tridiagonal matrix
+  //
+  // g_d                   diagonal elements stored in global memory
+  // g_s                   superdiagonal elements stored in global memory
+  // n                     size of matrix
+  // x                     value for which the number of eigenvalues that are smaller is sought
+  // tid                   thread identifier (e.g. threadIdx.x or gtid)
+  // num_intervals_active  number of active intervals / threads that currently process an interval
+  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+  // converged             flag indicating whether the current thread has already converged (i.e. the count does not have to be computed)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline unsigned int  \n");
+  source.append("     computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+  source.append("                                __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+  source.append("                                const unsigned int n,  \n");
+  source.append("                                const "); source.append(numeric_string); source.append(" x,         \n");
+  source.append("                                const unsigned int tid,  \n");
+  source.append("                                const unsigned int num_intervals_active,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_d,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_s,  \n");
+  source.append("                                unsigned int converged  \n");
+  source.append("                               )  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("          "); source.append(numeric_string); source.append(" delta = 1.0f;  \n");
+  source.append("         unsigned int count = 0;  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      // read data into shared memory
+  source.append("         if (lcl_id < n)  \n");
+  source.append("         {  \n");
+  source.append("             s_d[lcl_id] = *(g_d + lcl_id);  \n");
+  source.append("             s_s[lcl_id] = *(g_s + lcl_id - 1);  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      // perform loop only for active threads
+  source.append("         if ((tid < num_intervals_active) && (0 == converged))  \n");
+  source.append("         {  \n");
+
+          // perform (optimized) Gaussian elimination to determine the number
+          // of eigenvalues that are smaller than x
+  source.append("             for (unsigned int k = 0; k < n; ++k)  \n");
+  source.append("             {  \n");
+  source.append("                 delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
+  source.append("                 count += (delta < 0) ? 1 : 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         } \n"); // end if thread currently processing an interval
+
+  source.append("         return count;  \n");
+  source.append("     }  \n");
+
+  }
+
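The counting loop above is a Sturm-sequence evaluation. A serial C++ sketch of the same recurrence (illustration only, assuming double precision; not code from this commit):

    #include <cstddef>
    #include <vector>

    // Counts the eigenvalues of the symmetric tridiagonal matrix with diagonal d
    // (size n) and superdiagonal s (size n-1) that are smaller than x, using the
    // recurrence delta_k = d_k - x - s_{k-1}^2 / delta_{k-1}; each negative delta
    // corresponds to one eigenvalue below x.
    unsigned int count_smaller_than(const std::vector<double> &d,
                                    const std::vector<double> &s,
                                    double x)
    {
      double delta = 1.0;
      unsigned int count = 0;
      for (std::size_t k = 0; k < d.size(); ++k)
      {
        double off = (k == 0) ? 0.0 : s[k - 1];   // the kernel reads *(g_s - 1) == 0 for k == 0
        delta = d[k] - x - off * off / delta;
        if (delta < 0.0)
          ++count;
      }
      return count;
    }

In exact arithmetic delta can become zero; the commented-out clamp in the Large variant below hints at one way to guard the division.
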
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute number of eigenvalues that are smaller than x given a symmetric,
+  // real, and tridiagonal matrix
+  //
+  // g_d                   diagonal elements stored in global memory
+  // g_s                   superdiagonal elements stored in global memory
+  // n                     size of matrix
+  // x                     value for which the number of eigenvalues that are smaller is sought
+  // tid                   thread identifier (e.g. threadIdx.x or gtid)
+  // num_intervals_active  number of active intervals / threads that currently process an interval
+  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+  // converged             flag indicating whether the current thread has already converged (i.e. the count does not have to be computed)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline unsigned int  \n");
+  source.append("     computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+  source.append("                                __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+  source.append("                                const unsigned int n,  \n");
+  source.append("                                const "); source.append(numeric_string); source.append(" x,         \n");
+  source.append("                                const unsigned int tid,  \n");
+  source.append("                                const unsigned int num_intervals_active,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_d,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_s,  \n");
+  source.append("                                unsigned int converged  \n");
+  source.append("                               )  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("          "); source.append(numeric_string); source.append(" delta = 1.0f;  \n");
+  source.append("         unsigned int count = 0;  \n");
+
+  source.append("         unsigned int rem = n;  \n");
+
+      // do until the whole diagonal and superdiagonal have been loaded and processed
+  source.append("         for (unsigned int i = 0; i < n; i += lcl_sz)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // read new chunk of data into shared memory
+  source.append("             if ((i + lcl_id) < n)  \n");
+  source.append("             {  \n");
+
+  source.append("                 s_d[lcl_id] = *(g_d + i + lcl_id);  \n");
+  source.append("                 s_s[lcl_id] = *(g_s + i + lcl_id - 1);  \n");
+  source.append("             }  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+
+  source.append("             if (tid < num_intervals_active)  \n");
+  source.append("             {  \n");
+
+              // perform (optimized) Gaussian elimination to determine the number
+              // of eigenvalues that are smaller than x
+  source.append("                 for (unsigned int k = 0; k < min(rem,lcl_sz); ++k)  \n");
+  source.append("                 {  \n");
+  source.append("                     delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
+                  // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
+  source.append("                     count += (delta < 0) ? 1 : 0;  \n");
+  source.append("                 }  \n");
+
+  source.append("             } \n"); // end if thread currently processing an interval
+
+  source.append("             rem -= lcl_sz;  \n");
+  source.append("         }  \n");
+
+  source.append("         return count;  \n");
+  source.append("     }  \n");
+
+
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Store all non-empty intervals resulting from the subdivision of the interval
+  // currently processed by the thread
+  //
+  // addr                     base address for storing intervals
+  // num_threads_active       number of threads / intervals in current sweep
+  // s_left                   shared memory storage for left interval limits
+  // s_right                  shared memory storage for right interval limits
+  // s_left_count             shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count            shared memory storage for number of eigenvalues less than right interval limits
+  // left                     lower limit of interval
+  // mid                      midpoint of interval
+  // right                    upper limit of interval
+  // left_count               eigenvalues less than \a left
+  // mid_count                eigenvalues less than \a mid
+  // right_count              eigenvalues less than \a right
+  // precision                desired precision for eigenvalues
+  // compact_second_chunk     shared mem flag indicating whether the second chunk is used and therefore requires compaction
+  // s_compaction_list_exc    helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
+  // is_active_second         flag set to 1 if the thread has a second non-empty child interval
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeNonEmptyIntervals(unsigned int addr,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                            __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+  source.append("                            __local unsigned int *s_left_count,  \n");
+  source.append("                            __local unsigned int *s_right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" left, \n ");
+  source.append("                             "); source.append(numeric_string); source.append(" mid,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" right,\n");
+  source.append("                            const unsigned int left_count,  \n");
+  source.append("                            const unsigned int mid_count,  \n");
+  source.append("                            const unsigned int right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" precision,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            __local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // check if both child intervals are valid
+  source.append("          \n");
+  source.append("         if ((left_count != mid_count) && (mid_count != right_count))  \n");
+  source.append("         {  \n");
+
+          // store the left interval
+  source.append("             storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                           left, mid, left_count, mid_count, precision);  \n");
+
+          // mark that a second interval has been generated, only stored after
+          // stream compaction of second chunk
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[lcl_id] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // only one non-empty child interval
+
+          // mark that no second child
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
+
+          // store the one valid child interval
+  source.append("             if (left_count != mid_count)  \n");
+  source.append("             {  \n");
+  source.append("                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                               left, mid, left_count, mid_count, precision);  \n");
+  source.append("             }  \n");
+  source.append("             else  \n");
+  source.append("             {  \n");
+  source.append("                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                               mid, right, mid_count, right_count, precision);  \n");
+  source.append("             }  \n");
+
+  source.append("         }  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //! Store all non-empty intervals resulting from the subdivision of the interval
+  //! currently processed by the thread
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
+  {
+      source.append("       \n");
+      source.append("     void  \n");
+      source.append("     storeNonEmptyIntervalsLarge(unsigned int addr,  \n");
+      source.append("                            const unsigned int num_threads_active,  \n");
+      source.append("                            __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+      source.append("                            __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+      source.append("                            __local unsigned short *s_left_count,  \n");
+      source.append("                            __local unsigned short *s_right_count,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" left, \n ");
+      source.append("                             "); source.append(numeric_string); source.append(" mid,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" right,\n");
+      source.append("                            const unsigned int left_count,  \n");
+      source.append("                            const unsigned int mid_count,  \n");
+      source.append("                            const unsigned int right_count,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" epsilon,  \n");
+      source.append("                            __local unsigned int *compact_second_chunk,  \n");
+      source.append("                            __local unsigned short *s_compaction_list,  \n");
+      source.append("                            unsigned int *is_active_second)  \n");
+      source.append("     {  \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+          // check if both child intervals are valid
+      source.append("         if ((left_count != mid_count) && (mid_count != right_count))  \n");
+      source.append("         {  \n");
+
+      source.append("             storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                           left, mid, left_count, mid_count, epsilon);  \n");
+
+      source.append("             *is_active_second = 1;  \n");
+      source.append("             s_compaction_list[lcl_id] = 1;  \n");
+      source.append("             *compact_second_chunk = 1;  \n");
+      source.append("         }  \n");
+      source.append("         else  \n");
+      source.append("         {  \n");
+
+              // only one non-empty child interval
+
+              // mark that no second child
+      source.append("             *is_active_second = 0;  \n");
+      source.append("             s_compaction_list[lcl_id] = 0;  \n");
+
+              // store the one valid child interval
+      source.append("             if (left_count != mid_count)  \n");
+      source.append("             {  \n");
+      source.append("                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                               left, mid, left_count, mid_count, epsilon);  \n");
+      source.append("             }  \n");
+      source.append("             else  \n");
+      source.append("             {  \n");
+      source.append("                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                               mid, right, mid_count, right_count, epsilon);  \n");
+      source.append("             }  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Create indices for compaction, i.e. process \a s_compaction_list_exc,
+  // which is 1 for intervals that generated a second child and 0 otherwise,
+  // and compute for each non-zero element the index at which the new
+  // interval belongs in a compact representation of all generated second children
+  //
+  // s_compaction_list_exc      list containing the flags which threads generated two children
+  // num_threads_compaction     number of threads to employ for compaction
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_createIndicesCompaction(StringType & source)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     createIndicesCompaction(__local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                             unsigned int num_threads_compaction)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("         unsigned int offset = 1;  \n");
+  source.append("         const unsigned int tid = lcl_id;  \n");
+     // if(tid == 0)
+       // printf("num_threads_compaction = %u\n", num_threads_compaction);
+
+      // higher levels of scan tree
+  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < d)  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
+  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
+  source.append("              \n");
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+
+  source.append("             offset <<= 1;  \n");
+  source.append("         }  \n");
+
+      // traverse down tree: first down to level 2 across
+  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             offset >>= 1;  \n");
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < (d-1))  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
+  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
+
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("     }  \n");
+  }
+
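The scan generated above gives every thread that produced a second child interval a unique write offset. A serial exclusive prefix sum over the flags produces the same offsets; the following is only an illustrative sketch, not the parallel scan tree the kernel performs:

    #include <cstddef>
    #include <vector>

    // flags[i] is 1 if thread i generated a second child interval, 0 otherwise.
    // Returns, for each thread, the offset at which its second child interval is stored.
    std::vector<unsigned int> compaction_offsets(const std::vector<unsigned int> &flags)
    {
      std::vector<unsigned int> offsets(flags.size());
      unsigned int sum = 0;
      for (std::size_t i = 0; i < flags.size(); ++i)
      {
        offsets[i] = sum;     // exclusive prefix sum
        sum += flags[i];
      }
      return offsets;
    }

The running total over all active threads is exactly the number of new intervals that bisectKernelSmall later adds to num_threads_active.
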
+
+  template<typename StringType>
+  void generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc,  \n");
+  source.append("                             unsigned int num_threads_compaction)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("         unsigned int offset = 1;  \n");
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // higher levels of scan tree
+  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < d)  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
+  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
+  source.append("              \n");
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+
+  source.append("             offset <<= 1;  \n");
+  source.append("         }  \n");
+
+      // traverse down tree: first down to level 2 across
+  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             offset >>= 1;  \n");
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < (d-1))  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
+  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
+
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("     }  \n");
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Perform stream compaction for second child intervals
+  //
+  // s_left              shared memory storage for left interval limits
+  // s_right             shared memory storage for right interval limits
+  // s_left_count        shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count       shared memory storage for number of eigenvalues less than right interval limits
+  // mid                 midpoint of current interval (left of new interval)
+  // right               upper limit of interval
+  // mid_count           eigenvalues less than \a mid
+  // s_compaction_list   list containing the indices where the data has to be stored
+  // num_threads_active  number of active threads / intervals
+  // is_active_second    flag set to 1 if the thread has a second non-empty child interval
+  ///////////////////////////////////////////////////////////////////////////////
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left,  \n");
+  source.append("                      __local "); source.append(numeric_string); source.append(" *s_right, \n");
+  source.append("                      __local unsigned int *s_left_count, \n");
+  source.append("                      __local unsigned int *s_right_count,  \n");
+  source.append("                       "); source.append(numeric_string); source.append(" mid,  \n");
+  source.append("                       "); source.append(numeric_string); source.append(" right, \n");
+  source.append("                      unsigned int mid_count, unsigned int right_count,  \n");
+  source.append("                      __local unsigned int *s_compaction_list,  \n");
+  source.append("                      unsigned int num_threads_active,  \n");
+  source.append("                      unsigned int is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // perform compaction / copy data for all threads where the second
+      // child is not dead
+  source.append("         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
+  source.append("         {  \n");
+  source.append("             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
+  source.append("             s_left[addr_w] = mid;  \n");
+  source.append("             s_right[addr_w] = right;  \n");
+  source.append("             s_left_count[addr_w] = mid_count;  \n");
+  source.append("             s_right_count[addr_w] = right_count;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
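Combined with those offsets, the compaction step appends each second child interval behind the first num_threads_active entries. A hedged host-side illustration (the container names, e.g. write_offset, are hypothetical):

    #include <vector>

    // Appends the second child intervals behind the first num_threads_active
    // entries, at the offsets produced by the compaction scan.
    void compact_intervals(std::vector<double> &s_left,
                           std::vector<double> &s_right,
                           std::vector<unsigned int> &s_left_count,
                           std::vector<unsigned int> &s_right_count,
                           const std::vector<double> &mid,
                           const std::vector<double> &right,
                           const std::vector<unsigned int> &mid_count,
                           const std::vector<unsigned int> &right_count,
                           const std::vector<unsigned int> &write_offset,   // from compaction_offsets()
                           const std::vector<unsigned char> &is_active_second,
                           unsigned int num_threads_active)
    {
      for (unsigned int tid = 0; tid < num_threads_active; ++tid)
        if (is_active_second[tid])
        {
          unsigned int addr_w = num_threads_active + write_offset[tid];
          s_left[addr_w]        = mid[tid];
          s_right[addr_w]       = right[tid];
          s_left_count[addr_w]  = mid_count[tid];
          s_right_count[addr_w] = right_count[tid];
        }
    }
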
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left,  \n");
+  source.append("                      __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+  source.append("                      __local unsigned short *s_left_count, \n");
+  source.append("                      __local unsigned short *s_right_count,  \n");
+  source.append("                      "); source.append(numeric_string); source.append(" mid,   \n");
+  source.append("                      "); source.append(numeric_string); source.append(" right, \n");
+  source.append("                      unsigned int mid_count, unsigned int right_count,  \n");
+  source.append("                      __local unsigned short *s_compaction_list,  \n");
+  source.append("                      unsigned int num_threads_active,  \n");
+  source.append("                      unsigned int is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // perform compaction / copy data for all threads where the second
+      // child is not dead
+  source.append("         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
+  source.append("         {  \n");
+  source.append("             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
+  source.append("             s_left[addr_w] = mid;  \n");
+  source.append("             s_right[addr_w] = right;  \n");
+  source.append("             s_left_count[addr_w] = mid_count;  \n");
+  source.append("             s_right_count[addr_w] = right_count;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                            __local unsigned int *s_left_count, \n");
+  source.append("                            __local unsigned int *s_right_count,  \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *mid,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                            unsigned int *left_count,     \n");
+  source.append("                            unsigned int *mid_count,      \n");
+  source.append("                            unsigned int *right_count,     \n");
+  source.append("                            __local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+  source.append("         const unsigned int multiplicity = *right_count - *left_count;  \n");
+      // check multiplicity of eigenvalue
+  source.append("         if (1 == multiplicity)  \n");
+  source.append("         {  \n");
+
+          // just re-store intervals, simple eigenvalue
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *right_count;  \n");
+  source.append("             \n");
+
+          // mark that no second child / clear
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[tid] = 0;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // number of eigenvalues after the split less than mid
+  source.append("             *mid_count = *left_count + (multiplicity >> 1);  \n");
+
+          // store left interval
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *mid_count;  \n");
+  source.append("             *mid = *left;  \n");
+
+          // mark that second child interval exists
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[tid] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                            __local unsigned short *s_left_count, \n");
+  source.append("                            __local unsigned short *s_right_count,  \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *mid,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                            unsigned int *left_count,     \n");
+  source.append("                            unsigned int *mid_count,      \n");
+  source.append("                            unsigned int *right_count,     \n");
+  source.append("                            __local unsigned short *s_compaction_list_exc,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+  source.append("         const unsigned int multiplicity = *right_count - *left_count;  \n");
+      // check multiplicity of eigenvalue
+  source.append("         if (1 == multiplicity)  \n");
+  source.append("         {  \n");
+
+          // just re-store intervals, simple eigenvalue
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *right_count;  \n");
+  source.append("             \n");
+
+          // mark that no second child / clear
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[tid] = 0;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // number of eigenvalues after the split less than mid
+  source.append("             *mid_count = *left_count + (multiplicity >> 1);  \n");
+
+          // store left interval
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *mid_count;  \n");
+  source.append("             *mid = *left;  \n");
+
+          // mark that second child interval exists
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[tid] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Subdivide interval if active and not already converged
+  //
+  // tid                    id of thread
+  // s_left                 shared memory storage for left interval limits
+  // s_right                shared memory storage for right interval limits
+  // s_left_count           shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count          shared memory storage for number of eigenvalues less than right interval limits
+  // num_threads_active     number of active threads in warp
+  // left                   lower limit of interval
+  // right                  upper limit of interval
+  // left_count             eigenvalues less than \a left
+  // right_count            eigenvalues less than \a right
+  // all_threads_converged  shared memory flag indicating whether all threads have converged
+  ///////////////////////////////////////////////////////////////////////////////
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     subdivideActiveIntervalMulti(const unsigned int tid,  \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_left,    \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                             __local unsigned int *s_left_count,   \n");
+  source.append("                             __local unsigned int *s_right_count,  \n");
+  source.append("                             const unsigned int num_threads_active,  \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                             unsigned int *left_count, unsigned int *right_count,  \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *mid,    \n");
+  source.append("                              __local unsigned int *all_threads_converged)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // for all active threads
+  source.append("         if (tid < num_threads_active)  \n");
+  source.append("         {  \n");
+
+  source.append("             *left = s_left[tid];  \n");
+  source.append("             *right = s_right[tid];  \n");
+  source.append("             *left_count = s_left_count[tid];  \n");
+  source.append("             *right_count = s_right_count[tid];  \n");
+
+          // check if thread already converged
+  source.append("             if (*left != *right)  \n");
+  source.append("             {  \n");
+
+  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+  source.append("             else if ((*right_count - *left_count) > 1)  \n");
+  source.append("             {  \n");
+              // mark as not converged if multiple eigenvalues are enclosed;
+              // the interval is duplicated in storeIntervalConverged()
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         }    \n");
+  // end for all active threads
+  source.append("     }  \n");
+  }
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     subdivideActiveIntervalShort(const unsigned int tid,  \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_left,    \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                             __local unsigned short *s_left_count,   \n");
+  source.append("                             __local unsigned short *s_right_count,  \n");
+  source.append("                             const unsigned int num_threads_active,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                             unsigned int *left_count, unsigned int *right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *mid,    \n");
+  source.append("                             __local unsigned int *all_threads_converged)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // for all active threads
+  source.append("         if (tid < num_threads_active)  \n");
+  source.append("         {  \n");
+
+  source.append("             *left = s_left[tid];  \n");
+  source.append("             *right = s_right[tid];  \n");
+  source.append("             *left_count = s_left_count[tid];  \n");
+  source.append("             *right_count = s_right_count[tid];  \n");
+
+          // check if thread already converged
+  source.append("             if (*left != *right)  \n");
+  source.append("             {  \n");
+
+  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         }    \n");
+  // end for all active threads
+  source.append("     }  \n");
+  }
+
+  // end of utilities
+  // start of kernels
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+  //
+  // g_d             diagonal elements in global memory
+  // g_s             superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+  // n               size of matrix
+  // lg              lower bound of input interval (e.g. Gerschgorin interval)
+  // ug              upper bound of input interval (e.g. Gerschgorin interval)
+  // lg_eig_count    number of eigenvalues that are smaller than \a lg
+  // ug_eig_count    number of eigenvalues that are smaller than \a ug
+  // epsilon         desired accuracy of eigenvalues to compute
+  ////////////////////////////////////////////////////////////////////////////////
+  ///
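Before the kernel source itself, it may help to see the overall strategy in serial form. The following is an assumption-laden simplification, not the generated kernel: it reuses the count_smaller_than() helper sketched earlier and ignores the work-group compaction as well as the kernel's special treatment of converged intervals that enclose more than one eigenvalue.

    #include <cmath>
    #include <vector>

    // Sturm-count helper sketched earlier (declaration only).
    unsigned int count_smaller_than(const std::vector<double> &d,
                                    const std::vector<double> &s, double x);

    struct Interval
    {
      double left, right;
      unsigned int left_count, right_count;   // eigenvalues smaller than left / right
    };

    // Serial bisection: split each interval at its midpoint and keep only the
    // halves that contain eigenvalues, until every interval is narrower than epsilon.
    std::vector<double> bisect_eigenvalues(const std::vector<double> &d,
                                           const std::vector<double> &s,
                                           double lg, double ug,
                                           unsigned int lg_eig_count,
                                           unsigned int ug_eig_count,
                                           double epsilon)
    {
      std::vector<Interval> work;
      work.push_back(Interval{lg, ug, lg_eig_count, ug_eig_count});
      std::vector<double> eigenvalues;

      while (!work.empty())
      {
        Interval iv = work.back();
        work.pop_back();

        if (std::fabs(iv.right - iv.left) <= epsilon)
        {
          // converged: report one eigenvalue per unit of multiplicity
          for (unsigned int i = iv.left_count; i < iv.right_count; ++i)
            eigenvalues.push_back(0.5 * (iv.left + iv.right));
          continue;
        }

        double mid = 0.5 * (iv.left + iv.right);
        unsigned int mid_count = count_smaller_than(d, s, mid);

        if (mid_count > iv.left_count)               // left child contains eigenvalues
          work.push_back(Interval{iv.left, mid, iv.left_count, mid_count});
        if (iv.right_count > mid_count)              // right child contains eigenvalues
          work.push_back(Interval{mid, iv.right, mid_count, iv.right_count});
      }
      return eigenvalues;
    }

The kernel generated below performs the same subdivision with one work-item per interval, keeping the interval lists in local memory and using the scan-based compaction shown earlier to store the second children contiguously.
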
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+      source.append("                  const unsigned int n,  \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_left,   \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                  __global unsigned int *g_left_count, __global unsigned int *g_right_count,  \n");
+      source.append("                  const "); source.append(numeric_string); source.append(" lg,  \n");
+      source.append("                  const "); source.append(numeric_string); source.append(" ug,  \n");
+      source.append("                  const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
+      source.append("                  "); source.append(numeric_string); source.append(" epsilon  \n");
+      source.append("                 )  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+          // intervals (store left and right because the subdivision tree is in general
+          // not dense)
+      source.append("         __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+      source.append("         __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+
+          // number of eigenvalues that are smaller than s_left / s_right
+          // (correspondence is realized via indices)
+      source.append("         __local  unsigned int  s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+      source.append("         __local  unsigned int  s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+
+          // helper for stream compaction
+      source.append("         __local  unsigned int  \n");
+      source.append("           s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];  \n");
+
+          // state variables for whole block
+          // if 0 then compaction of second chunk of child intervals is not necessary
+          // (because all intervals had exactly one non-dead child)
+      source.append("         __local  unsigned int compact_second_chunk;  \n");
+      source.append("         __local  unsigned int all_threads_converged;  \n");
+
+          // number of currently active threads
+      source.append("         __local  unsigned int num_threads_active;  \n");
+
+          // number of threads to use for stream compaction
+      source.append("         __local  unsigned int num_threads_compaction;  \n");
+
+          // helper for exclusive scan
+      source.append("         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
+
+
+          // variables for currently processed interval
+          // left and right limit of active interval
+      source.append("          "); source.append(numeric_string); source.append(" left = 0.0f;  \n");
+      source.append("          "); source.append(numeric_string); source.append(" right = 0.0f;  \n");
+      source.append("         unsigned int left_count = 0;  \n");
+      source.append("         unsigned int right_count = 0;  \n");
+          // midpoint of active interval
+      source.append("          "); source.append(numeric_string); source.append(" mid = 0.0f;  \n");
+          // number of eigenvalues smaller than mid
+      source.append("         unsigned int mid_count = 0;  \n");
+          // affected by compaction
+      source.append("         unsigned int  is_active_second = 0;  \n");
+
+      source.append("         s_compaction_list[lcl_id] = 0;  \n");
+      source.append("         s_left[lcl_id] = 0.0;  \n");
+      source.append("         s_right[lcl_id] = 0.0;  \n");
+      source.append("         s_left_count[lcl_id] = 0;  \n");
+      source.append("         s_right_count[lcl_id] = 0;  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // set up initial configuration
+      source.append("         if (0 == lcl_id)  \n");
+      source.append("         {  \n");
+      source.append("             s_left[0] = lg;  \n");
+      source.append("             s_right[0] = ug;  \n");
+      source.append("             s_left_count[0] = lg_eig_count;  \n");
+      source.append("             s_right_count[0] = ug_eig_count;  \n");
+
+      source.append("             compact_second_chunk = 0;  \n");
+      source.append("             num_threads_active = 1;  \n");
+
+      source.append("             num_threads_compaction = 1;  \n");
+      source.append("         }  \n");
+
+          // for all active threads read intervals from the last level
+          // the number of (worst case) active threads per level l is 2^l
+
+      source.append("         while (true)  \n");
+      source.append("         {  \n");
+
+      source.append("             all_threads_converged = 1;  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             is_active_second = 0;  \n");
+      source.append("             subdivideActiveIntervalMulti(lcl_id,  \n");
+      source.append("                                     s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                     num_threads_active,  \n");
+      source.append("                                     &left, &right, &left_count, &right_count,  \n");
+      source.append("                                     &mid, &all_threads_converged);  \n");
+   //   source.append("             output[lcl_id] = s_left;  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // check if done
+      source.append("             if (1 == all_threads_converged)  \n");
+      source.append("             {  \n");
+      source.append("                 break;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // compute number of eigenvalues smaller than mid
+              // use all threads for reading the necessary matrix data from global
+              // memory
+              // use s_left and s_right as scratch space for diagonal and
+              // superdiagonal of matrix
+      source.append("             mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,  \n");
+      source.append("                                                    lcl_id, num_threads_active,  \n");
+      source.append("                                                    s_left, s_right,  \n");
+      source.append("                                                    (left == right));  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // store intervals
+              // for all threads store the first child interval in a continuous chunk of
+              // memory, and the second child interval -- if it exists -- in a second
+              // chunk; it is likely that all threads reach convergence up to
+              // \a epsilon at the same level; furthermore, at higher levels most / all
+              // threads will have only one child. Storing the first child compactly
+              // (first) avoids a compaction step on the first chunk and (second) makes
+              // a compaction of the second chunk unnecessary at higher levels (when all
+              // threads / intervals have exactly one child).
+      source.append("             if (lcl_id < num_threads_active)  \n");
+      source.append("             {  \n");
+
+      source.append("                 if (left != right)  \n");
+      source.append("                 {  \n");
+
+                      // store intervals
+      source.append("                     storeNonEmptyIntervals(lcl_id, num_threads_active,  \n");
+      source.append("                                            s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            left, mid, right,  \n");
+      source.append("                                            left_count, mid_count, right_count,  \n");
+      source.append("                                            epsilon, &compact_second_chunk,  \n");
+      source.append("                                            s_compaction_list_exc,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                 }  \n");
+      source.append("                 else  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            &left, &mid, &right,  \n");
+      source.append("                                            &left_count, &mid_count, &right_count,  \n");
+      source.append("                                            s_compaction_list_exc, &compact_second_chunk,  \n");
+      source.append("                                            num_threads_active,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                 }  \n");
+      source.append("             }  \n");
+
+              // necessary so that compact_second_chunk is up-to-date
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // perform compaction of chunk where second children are stored
+              // scan of (num_threads_active / 2) elements, thus at most
+              // (num_threads_active / 4) threads are needed
+      source.append("             if (compact_second_chunk > 0)  \n");
+      source.append("             {  \n");
+
+      source.append("                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
+
+      source.append("                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                  mid, right, mid_count, right_count,  \n");
+      source.append("                                  s_compaction_list, num_threads_active,  \n");
+      source.append("                                  is_active_second);  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             if (0 == lcl_id)  \n");
+      source.append("             {  \n");
+
+                  // update number of active threads with result of reduction
+      source.append("                 num_threads_active += s_compaction_list[num_threads_active];  \n");
+
+      source.append("                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("                 compact_second_chunk = 0;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("         }  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // write resulting intervals to global mem
+          // for all threads write if they have been converged to an eigenvalue to
+          // a separate array
+
+          // at most n valid intervals
+      source.append("         if (lcl_id < n)  \n");
+      source.append("         {  \n");
+              // intervals converged so left and right limit are identical
+      source.append("             g_left[lcl_id]  = s_left[lcl_id];  \n");
+              // left count is sufficient to have global order
+      source.append("             g_left_count[lcl_id]  = s_left_count[lcl_id];  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+     }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
+  //
+  // g_d              diagonal elements of symmetric, tridiagonal matrix
+  // g_s              superdiagonal elements of symmetric, tridiagonal matrix
+  // n                matrix size
+  // blocks_mult      start addresses of blocks of intervals that are processed by one block of threads; each of these intervals contains more than one eigenvalue
+  // blocks_mult_sum  total number of eigenvalues / singleton intervals in one block of intervals
+  // g_left           left limits of intervals
+  // g_right          right limits of intervals
+  // g_left_count     number of eigenvalues less than left limits
+  // g_right_count    number of eigenvalues less than right limits
+  // g_lambda         final eigenvalue
+  // g_pos            index of eigenvalue (in ascending order)
+  // precision         desired precision of eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+      source.append("                                     const unsigned int n,  \n");
+      source.append("                                     __global unsigned int *blocks_mult,  \n");
+      source.append("                                     __global unsigned int *blocks_mult_sum,  \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_left,   \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                                     __global unsigned int *g_left_count,  \n");
+      source.append("                                     __global unsigned int *g_right_count,  \n");
+      source.append("                                     __global  "); source.append(numeric_string); source.append(" *g_lambda, \n");
+      source.append("                                     __global unsigned int *g_pos,  \n");
+      source.append("                                     "); source.append(numeric_string); source.append(" precision  \n");
+      source.append("                                    )  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      source.append("       const unsigned int tid = lcl_id;  \n");
+
+          // left and right limits of interval
+      source.append("         __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+      source.append("         __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+
+          // number of eigenvalues smaller than interval limits
+      source.append("         __local  unsigned int  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+      source.append("         __local  unsigned int  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+
+          // helper array for chunk compaction of second chunk
+      source.append("         __local  unsigned int  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
+          // compaction list helper for exclusive scan
+      source.append("         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
+
+          // flag if all threads are converged
+      source.append("         __local  unsigned int  all_threads_converged;  \n");
+          // number of active threads
+      source.append("         __local  unsigned int  num_threads_active;  \n");
+          // number of threads to employ for compaction
+      source.append("         __local  unsigned int  num_threads_compaction;  \n");
+          // flag if second chunk has to be compacted
+      source.append("         __local  unsigned int compact_second_chunk;  \n");
+
+          // parameters of block of intervals processed by this block of threads
+      source.append("         __local  unsigned int  c_block_start;  \n");
+      source.append("         __local  unsigned int  c_block_end;  \n");
+      source.append("         __local  unsigned int  c_block_offset_output;  \n");
+
+          // midpoint of currently active interval of the thread
+      source.append("         "); source.append(numeric_string); source.append(" mid = 0.0f;  \n");
+          // number of eigenvalues smaller than \a mid
+      source.append("         unsigned int  mid_count = 0;  \n");
+          // current interval parameter
+      source.append("         "); source.append(numeric_string); source.append(" left = 0.0f;  \n");
+      source.append("         "); source.append(numeric_string); source.append(" right = 0.0f;  \n");
+      source.append("         unsigned int  left_count = 0;  \n");
+      source.append("         unsigned int  right_count = 0;  \n");
+          // helper for compaction, keep track which threads have a second child
+      source.append("         unsigned int  is_active_second = 0;  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE);            \n");
+
+          // initialize common start conditions
+      source.append("         if (0 == tid)  \n");
+      source.append("         {  \n");
+
+      source.append("             c_block_start = blocks_mult[grp_id];  \n");
+      source.append("             c_block_end = blocks_mult[grp_id + 1];  \n");
+      source.append("             c_block_offset_output = blocks_mult_sum[grp_id];  \n");
+      source.append("               \n");
+
+      source.append("             num_threads_active = c_block_end - c_block_start;  \n");
+      source.append("             s_compaction_list[0] = 0;  \n");
+      source.append("             num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("             all_threads_converged = 1;  \n");
+      source.append("             compact_second_chunk = 0;  \n");
+      source.append("         }  \n");
+      source.append("          s_left_count [tid] = 42;  \n");
+      source.append("          s_right_count[tid] = 42;  \n");
+      source.append("          s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+      source.append("          s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+      source.append("           \n");
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+      source.append("           \n");
+
+          // read data into shared memory
+      source.append("         if (tid < num_threads_active)  \n");
+      source.append("         {  \n");
+
+      source.append("             s_left[tid]  = g_left[c_block_start + tid];  \n");
+      source.append("             s_right[tid] = g_right[c_block_start + tid];  \n");
+      source.append("             s_left_count[tid]  = g_left_count[c_block_start + tid];  \n");
+      source.append("             s_right_count[tid] = g_right_count[c_block_start + tid];  \n");
+      source.append("               \n");
+      source.append("         }  \n");
+      source.append("        \n");
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+      source.append("         unsigned int iter = 0;  \n");
+          // do until all threads converged
+      source.append("         while (true)  \n");
+      source.append("         {  \n");
+      source.append("             iter++;  \n");
+              //for (int iter=0; iter < 0; iter++) {
+      source.append("             s_compaction_list[lcl_id] = 0;  \n");
+      source.append("             s_compaction_list[lcl_id + lcl_sz] = 0;  \n");
+      source.append("             s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+
+              // subdivide interval if currently active and not already converged
+      source.append("             subdivideActiveIntervalMulti(tid, s_left, s_right,  \n");
+      source.append("                                     s_left_count, s_right_count,  \n");
+      source.append("                                     num_threads_active,  \n");
+      source.append("                                     &left, &right, &left_count, &right_count,  \n");
+      source.append("                                     &mid, &all_threads_converged);  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // stop if all eigenvalues have been found
+      source.append("             if (1 == all_threads_converged)  \n");
+      source.append("             {  \n");
+      source.append("                  \n");
+      source.append("                 break;  \n");
+      source.append("             }  \n");
+
+              // compute number of eigenvalues smaller than mid for active and not
+              // converged intervals, use all threads for loading data from gmem and
+              // s_left and s_right as scratch space to store the data load from gmem
+              // in shared memory
+      source.append("             mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
+      source.append("                                                         mid, tid, num_threads_active,  \n");
+      source.append("                                                         s_left, s_right,  \n");
+      source.append("                                                         (left == right));  \n");
+      source.append("                                                \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             if (tid < num_threads_active)  \n");
+      source.append("             {  \n");
+      source.append("                   \n");
+                  // store intervals
+      source.append("                 if (left != right)  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeNonEmptyIntervals(tid, num_threads_active,  \n");
+      source.append("                                            s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            left, mid, right,  \n");
+      source.append("                                            left_count, mid_count, right_count,  \n");
+      source.append("                                            precision, &compact_second_chunk,  \n");
+      source.append("                                            s_compaction_list_exc,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                      \n");
+      source.append("                 }  \n");
+      source.append("                 else  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            &left, &mid, &right,  \n");
+      source.append("                                            &left_count, &mid_count, &right_count,  \n");
+      source.append("                                            s_compaction_list_exc, &compact_second_chunk,  \n");
+      source.append("                                            num_threads_active,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                   \n");
+      source.append("                 }  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // compact second chunk of intervals if any of the threads generated
+              // two child intervals
+      source.append("             if (1 == compact_second_chunk)  \n");
+      source.append("             {  \n");
+
+      source.append("                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
+      source.append("                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                  mid, right, mid_count, right_count,  \n");
+      source.append("                                  s_compaction_list, num_threads_active,  \n");
+      source.append("                                  is_active_second);  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // update state variables
+      source.append("             if (0 == tid)  \n");
+      source.append("             {  \n");
+      source.append("                 num_threads_active += s_compaction_list[num_threads_active];  \n");
+      source.append("                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("                 compact_second_chunk = 0;  \n");
+      source.append("                 all_threads_converged = 1;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // clear
+      source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
+      source.append("             s_compaction_list_exc[lcl_id + lcl_sz] = 0;   \n");
+      source.append("               \n");
+      source.append("             if (num_threads_compaction > lcl_sz)              \n");
+      source.append("             {  \n");
+      source.append("               break;  \n");
+      source.append("             }  \n");
+
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("    } \n"); // end until all threads converged
+
+          // write data back to global memory
+      source.append("         if (tid < num_threads_active)  \n");
+      source.append("         {  \n");
+
+      source.append("             unsigned int addr = c_block_offset_output + tid;  \n");
+      source.append("               \n");
+      source.append("             g_lambda[addr]  = s_left[tid];  \n");
+      source.append("             g_pos[addr]   = s_right_count[tid];  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+  }
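
For reference, the subdivision loop that the generated bisectKernelLarge_MultIntervals kernel runs in parallel can be sketched serially on the host. The sketch below is illustrative only: the Interval struct, count_smaller() and refine_intervals() are hypothetical helpers, and the midpoint and convergence test are simplified relative to the kernel (which uses an overflow-safe midpoint and a relative tolerance); none of this is part of the patch.

  #include <cstddef>
  #include <vector>

  struct Interval { double left, right; unsigned int left_count, right_count; };

  // Sturm-type count used throughout the bisection kernels: number of eigenvalues of
  // the symmetric tridiagonal matrix (diagonal d, superdiagonal s) smaller than x.
  static unsigned int count_smaller(std::vector<double> const & d,
                                    std::vector<double> const & s, double x)
  {
    double delta = 1.0;
    unsigned int count = 0;
    for (std::size_t k = 0; k < d.size(); ++k)
    {
      double off = (k == 0) ? 0.0 : s[k - 1];   // *(s - 1) is treated as 0, as in the kernels
      delta = d[k] - x - off * off / delta;
      if (delta < 0) ++count;
    }
    return count;
  }

  // Serial analogue of the kernel's main loop: split every unconverged interval at its
  // midpoint, keep the non-empty children, and emit an eigenvalue (with multiplicity)
  // once an interval has shrunk below the requested precision.
  static std::vector<double> refine_intervals(std::vector<Interval> work,
                                              std::vector<double> const & d,
                                              std::vector<double> const & s,
                                              double precision)
  {
    std::vector<double> eigenvalues;
    while (!work.empty())
    {
      std::vector<Interval> next;
      for (std::size_t i = 0; i < work.size(); ++i)
      {
        Interval iv = work[i];
        double mid = 0.5 * (iv.left + iv.right);
        unsigned int mid_count = count_smaller(d, s, mid);
        Interval children[2] = { { iv.left, mid, iv.left_count, mid_count },
                                 { mid, iv.right, mid_count, iv.right_count } };
        for (int c = 0; c < 2; ++c)
        {
          unsigned int mult = children[c].right_count - children[c].left_count;
          if (mult == 0)
            continue;                                       // empty child, drop it
          if (children[c].right - children[c].left < precision)
            eigenvalues.insert(eigenvalues.end(), mult,
                               0.5 * (children[c].left + children[c].right));
          else
            next.push_back(children[c]);                    // subdivide further
        }
      }
      work.swap(next);
    }
    return eigenvalues;
  }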
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
+  //
+  // g_d            diagonal elements of symmetric, tridiagonal matrix
+  // g_s            superdiagonal elements of symmetric, tridiagonal matrix
+  // n              matrix size
+  // num_intervals  total number of intervals containing one eigenvalue after the first step
+  // g_left         left interval limits
+  // g_right        right interval limits
+  // g_pos          index of interval / number of intervals that are smaller than right interval limit
+  // precision      desired precision of eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_s,    \n");
+      source.append("                                    const unsigned int n,  \n");
+      source.append("                                    unsigned int num_intervals,  \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_left,  \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                                    __global unsigned int *g_pos,  \n");
+      source.append("                                    "); source.append(numeric_string); source.append(" precision)  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+      source.append("         const unsigned int gtid = (lcl_sz * grp_id) + lcl_id;  \n");
+      source.append("         __loca

<TRUNCATED>

[38/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
new file mode 100755
index 0000000..960f5c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
@@ -0,0 +1,180 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
+    @brief Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for large matrices for intervals that after
+//! the first step contained one eigenvalue
+//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
+//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
+//! @param  n    matrix size
+//! @param  num_intervals  total number of intervals containing one eigenvalue
+//!                         after the first step
+//! @param g_left  left interval limits
+//! @param g_right  right interval limits
+//! @param g_pos  index of interval / number of intervals that are smaller than
+//!               right interval limit
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+                               unsigned int num_intervals,
+                               NumericT *g_left, NumericT *g_right,
+                               unsigned int *g_pos,
+                               NumericT  precision)
+{
+
+  const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+  __shared__  NumericT  s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
+  __shared__  NumericT  s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+  // active interval of thread
+  // left and right limit of current interval
+  NumericT left, right;
+  // number of eigenvalues smaller than the right limit (also corresponds to the
+  // global index of the eigenvalues contained in the active interval)
+  unsigned int right_count;
+  // flag if current thread converged
+  unsigned int converged = 0;
+  // midpoint when current interval is subdivided
+  NumericT mid = 0.0f;
+  // number of eigenvalues less than mid
+  unsigned int mid_count = 0;
+
+  // read data from global memory
+  if (gtid < num_intervals)
+  {
+    left = g_left[gtid];
+    right = g_right[gtid];
+    right_count = g_pos[gtid];
+  }
+
+
+  // flag to determine if all threads converged to eigenvalue
+  __shared__  unsigned int  converged_all_threads;
+
+  // initialize the shared flag
+  if (0 == threadIdx.x)
+  {
+    converged_all_threads = 0;
+  }
+
+  __syncthreads();
+
+  // process until all threads converged to an eigenvalue
+  while (true)
+  {
+
+    converged_all_threads = 1;
+
+    // update midpoint for all active threads
+    if ((gtid < num_intervals) && (0 == converged))
+    {
+      mid = computeMidpoint(left, right);
+    }
+
+    // find number of eigenvalues that are smaller than midpoint
+    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+                                                mid, gtid, num_intervals,
+                                                s_left_scratch,
+                                                s_right_scratch,
+                                                converged);
+
+    __syncthreads();
+
+    // for all active threads
+    if ((gtid < num_intervals) && (0 == converged))
+    {
+
+      // update intervals -- always one child interval survives
+      if (right_count == mid_count)
+      {
+        right = mid;
+      }
+      else
+      {
+        left = mid;
+      }
+
+      // check for convergence
+      NumericT t0 = right - left;
+      NumericT t1 = max(abs(right), abs(left)) * precision;
+
+      if (t0 < min(precision, t1))
+      {
+        NumericT lambda = computeMidpoint(left, right);
+        left = lambda;
+        right = lambda;
+
+        converged = 1;
+      }
+      else
+      {
+        converged_all_threads = 0;
+      }
+    }
+
+    __syncthreads();
+
+    if (1 == converged_all_threads)
+    {
+      break;
+    }
+
+    __syncthreads();
+  }
+
+  // write data back to global memory
+  __syncthreads();
+
+  if (gtid < num_intervals)
+  {
+      // intervals converged so left and right interval limit are both identical
+      // and identical to the eigenvalue
+      g_left[gtid] = left;
+  }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
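
A serial sketch of the per-interval refinement done by bisectKernelLarge_OneIntervals may help when reading the kernel: each interval already holds exactly one eigenvalue, so plain bisection converges to it, and the eigenvalue count below the midpoint decides which half survives. CountFn stands in for the Sturm-type count (computeNumSmallerEigenvalsLarge in the kernel); the helper name and signature are illustrative, not part of the patch.

  #include <algorithm>
  #include <cmath>

  // CountFn: callable taking x and returning the number of eigenvalues smaller than x.
  template<typename CountFn>
  double refine_one_eigenvalue(double left, double right, unsigned int right_count,
                               double precision, CountFn count_smaller)
  {
    while (true)
    {
      double mid = 0.5 * (left + right);
      unsigned int mid_count = count_smaller(mid);

      // exactly one child interval survives
      if (right_count == mid_count)
        right = mid;
      else
        left = mid;

      // same convergence test as in the kernel above
      double t0 = right - left;
      double t1 = std::max(std::fabs(right), std::fabs(left)) * precision;
      if (t0 < std::min(precision, t1))
        return 0.5 * (left + right);
    }
  }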

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
new file mode 100755
index 0000000..310b381
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
@@ -0,0 +1,261 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp
+    @brief Determine eigenvalues for small symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+*
+* @param  g_d  diagonal elements in global memory
+* @param  g_s  superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+* @param  n    size of matrix
+* @param  g_left         helper array
+* @param  g_right        helper array
+* @param  g_left_count   helper array
+* @param  g_right_count  helper array
+* @param  lg             lower bound of input interval (e.g. Gerschgorin interval)
+* @param  ug             upper bound of input interval (e.g. Gerschgorin interval)
+* @param  lg_eig_count   number of eigenvalues that are smaller than lg
+* @param  ug_eig_count   number of eigenvalues that are smaller than ug
+* @param  epsilon        desired accuracy of eigenvalues to compute
+*/
+template<typename NumericT>
+__global__
+void
+bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+             NumericT * g_left, NumericT *g_right,
+             unsigned int *g_left_count, unsigned int *g_right_count,
+             const NumericT lg, const NumericT ug,
+             const unsigned int lg_eig_count, const unsigned int ug_eig_count,
+             NumericT epsilon
+            )
+{
+    // intervals (store left and right because the subdivision tree is in general
+    // not dense)
+    __shared__  NumericT  s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+    __shared__  NumericT  s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+
+    // number of eigenvalues that are smaller than s_left / s_right
+    // (correspondence is realized via indices)
+    __shared__  unsigned int  s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+    __shared__  unsigned int  s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+
+    // helper for stream compaction
+    __shared__  unsigned int
+    s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];
+
+    // state variables for whole block
+    // if 0 then compaction of second chunk of child intervals is not necessary
+    // (because all intervals had exactly one non-dead child)
+    __shared__  unsigned int compact_second_chunk;
+    __shared__  unsigned int all_threads_converged;
+
+    // number of currently active threads
+    __shared__  unsigned int num_threads_active;
+
+    // number of threads to use for stream compaction
+    __shared__  unsigned int num_threads_compaction;
+
+    // helper for exclusive scan
+    unsigned int *s_compaction_list_exc = s_compaction_list + 1;
+
+
+    // variables for currently processed interval
+    // left and right limit of active interval
+    NumericT  left = 0.0f;
+    NumericT  right = 0.0f;
+    unsigned int left_count = 0;
+    unsigned int right_count = 0;
+    // midpoint of active interval
+    NumericT  mid = 0.0f;
+    // number of eigenvalues smaller than mid
+    unsigned int mid_count = 0;
+    // affected from compaction
+    unsigned int  is_active_second = 0;
+
+    s_compaction_list[threadIdx.x] = 0;
+    s_left[threadIdx.x] = 0;
+    s_right[threadIdx.x] = 0;
+    s_left_count[threadIdx.x] = 0;
+    s_right_count[threadIdx.x] = 0;
+
+    __syncthreads();
+
+    // set up initial configuration
+    if (0 == threadIdx.x)
+    {
+        s_left[0] = lg;
+        s_right[0] = ug;
+        s_left_count[0] = lg_eig_count;
+        s_right_count[0] = ug_eig_count;
+
+        compact_second_chunk = 0;
+        num_threads_active = 1;
+
+        num_threads_compaction = 1;
+    }
+
+    // for all active threads read intervals from the last level
+    // the number of (worst case) active threads per level l is 2^l
+    while (true)
+    {
+
+        all_threads_converged = 1;
+        __syncthreads();
+
+        is_active_second = 0;
+        subdivideActiveIntervalMulti(threadIdx.x,
+                                s_left, s_right, s_left_count, s_right_count,
+                                num_threads_active,
+                                left, right, left_count, right_count,
+                                mid, all_threads_converged);
+
+        __syncthreads();
+
+        // check if done
+        if (1 == all_threads_converged)
+        {
+            break;
+        }
+
+        __syncthreads();
+
+        // compute number of eigenvalues smaller than mid
+        // use all threads for reading the necessary matrix data from global
+        // memory
+        // use s_left and s_right as scratch space for diagonal and
+        // superdiagonal of matrix
+        mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
+                                               threadIdx.x, num_threads_active,
+                                               s_left, s_right,
+                                               (left == right));
+
+        __syncthreads();
+
+        // store intervals
+        // for all threads store the first child interval in a continuous chunk of
+        // memory, and the second child interval -- if it exists -- in a second
+        // chunk; it is likely that all threads reach convergence up to
+        // \a epsilon at the same level; furthermore, at higher levels most / all
+        // threads will have only one child. Storing the first child compactly
+        // (first) avoids a compaction step on the first chunk and (second) makes a
+        // compaction of the second chunk unnecessary at higher levels (when all
+        // threads / intervals have exactly one child)
+        if (threadIdx.x < num_threads_active)
+        {
+
+            if (left != right)
+            {
+
+                // store intervals
+                storeNonEmptyIntervals(threadIdx.x, num_threads_active,
+                                       s_left, s_right, s_left_count, s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       epsilon, compact_second_chunk,
+                                       s_compaction_list_exc,
+                                       is_active_second);
+            }
+            else
+            {
+
+                storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       s_compaction_list_exc, compact_second_chunk,
+                                       num_threads_active,
+                                       is_active_second);
+            }
+        }
+
+        // necessary so that compact_second_chunk is up-to-date
+        __syncthreads();
+
+        // perform compaction of chunk where second children are stored
+        // scan of (num_threads_active / 2) elements, thus at most
+        // (num_threads_active / 4) threads are needed
+        if (compact_second_chunk > 0)
+        {
+
+            createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+
+            compactIntervals(s_left, s_right, s_left_count, s_right_count,
+                             mid, right, mid_count, right_count,
+                             s_compaction_list, num_threads_active,
+                             is_active_second);
+        }
+
+        __syncthreads();
+
+        if (0 == threadIdx.x)
+        {
+
+            // update number of active threads with result of reduction
+            num_threads_active += s_compaction_list[num_threads_active];
+
+            num_threads_compaction = ceilPow2(num_threads_active);
+
+            compact_second_chunk = 0;
+        }
+
+        __syncthreads();
+
+    }
+
+    __syncthreads();
+
+    // write resulting intervals to global mem
+    // for all threads write if they have been converged to an eigenvalue to
+    // a separate array
+
+    // at most n valid intervals
+    if (threadIdx.x < n)
+    {
+
+        // intervals converged so left and right limit are identical
+        g_left[threadIdx.x]  = s_left[threadIdx.x];
+        // left count is sufficient to have global order
+        g_left_count[threadIdx.x]  = s_left_count[threadIdx.x];
+    }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef _BISECT_KERNEL_SMALL_H_
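
The lg / ug inputs of bisectKernelSmall are described above as Gerschgorin-type bounds. A small host-side sketch of how such bounds can be computed for a symmetric tridiagonal matrix is given below; the helper is illustrative and not part of the patch. With bounds that enclose the whole spectrum, lg_eig_count would be 0 and ug_eig_count would be n.

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <utility>
  #include <vector>

  // Gerschgorin bounds [lg, ug] for a symmetric tridiagonal matrix with diagonal d
  // (size n > 0) and superdiagonal s (size n - 1): every eigenvalue lies in the union
  // of the intervals centered at d[i] with radius |s[i-1]| + |s[i]|.
  std::pair<double, double> gerschgorin_bounds(std::vector<double> const & d,
                                               std::vector<double> const & s)
  {
    std::size_t n = d.size();
    double lg = d[0];
    double ug = d[0];
    for (std::size_t i = 0; i < n; ++i)
    {
      double radius = 0.0;
      if (i > 0)      radius += std::fabs(s[i - 1]);  // entry left of the diagonal
      if (i + 1 < n)  radius += std::fabs(s[i]);      // entry right of the diagonal
      lg = std::min(lg, d[i] - radius);
      ug = std::max(ug, d[i] + radius);
    }
    return std::make_pair(lg, ug);
  }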

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
new file mode 100755
index 0000000..e2e262c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
@@ -0,0 +1,613 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_util.hpp
+    @brief Utility / shared functionality for bisection kernels
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next lower power of two of n
+//! @param  n  number for which the next lower power of two is sought
+////////////////////////////////////////////////////////////////////////////////
+__device__
+inline int
+floorPow2(int n)
+{
+
+    // early out if already power of two
+    if (0 == (n & (n-1)))
+    {
+        return n;
+    }
+
+    int exp;
+    frexp((float)n, &exp);
+    return (1 << (exp - 1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next higher power of two of n
+//! @param  n  number for which the next higher power of two is sought
+////////////////////////////////////////////////////////////////////////////////
+__device__
+inline int
+ceilPow2(int n)
+{
+
+    // early out if already power of two
+    if (0 == (n & (n-1)))
+    {
+        return n;
+    }
+
+    int exp;
+    frexp((float)n, &exp);
+    return (1 << exp);
+}
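
The two helpers above rely on frexp applied to a float. An integer-only formulation with a couple of sample values makes the intent explicit; this is a host-side sketch, not the device code used by the kernels.

  #include <cassert>

  int floor_pow2_ref(int n)   // largest power of two <= n, for n > 0
  {
    int p = 1;
    while ((p << 1) <= n)
      p <<= 1;
    return p;
  }

  int ceil_pow2_ref(int n)    // smallest power of two >= n, for n > 0
  {
    int p = 1;
    while (p < n)
      p <<= 1;
    return p;
  }

  void pow2_examples()
  {
    assert(floor_pow2_ref(48) == 32 && ceil_pow2_ref(48) == 64);
    assert(floor_pow2_ref(64) == 64 && ceil_pow2_ref(64) == 64);  // powers of two are fixed points
  }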
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
+//! possible
+//! @param left   left / lower limit of interval
+//! @param right  right / upper limit of interval
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline NumericT
+computeMidpoint(const NumericT left, const NumericT right)
+{
+
+    NumericT mid;
+
+    if (viennacl::linalg::detail::sign_f(left) == viennacl::linalg::detail::sign_f(right))
+    {
+        mid = left + (right - left) * 0.5f;
+    }
+    else
+    {
+        mid = (left + right) * 0.5f;
+    }
+
+    return mid;
+}
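
The sign test in computeMidpoint selects, for each operand configuration, the formula whose intermediate result cannot overflow: with equal signs the difference right - left stays small while the sum may overflow, and with opposite signs it is the other way around. A small single-precision illustration (not part of the patch):

  #include <cstdio>

  void midpoint_overflow_examples()
  {
    // equal signs: left + right would exceed FLT_MAX, right - left stays small
    float a = 3.0e38f, b = 3.2e38f;
    float mid_same_sign = a + (b - a) * 0.5f;   // ~3.1e38f, no overflow

    // opposite signs: right - left would exceed FLT_MAX, left + right stays small
    float c = -3.0e38f, d = 3.2e38f;
    float mid_opposite  = (c + d) * 0.5f;       // ~1.0e37f, no overflow

    std::printf("%g %g\n", mid_same_sign, mid_opposite);
  }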
+
+////////////////////////////////////////////////////////////////////////////////
+//! Check if interval converged and store appropriately
+//! @param  addr    address where to store the information of the interval
+//! @param  s_left  shared memory storage for left interval limits
+//! @param  s_right  shared memory storage for right interval limits
+//! @param  s_left_count  shared memory storage for number of eigenvalues less
+//!                       than left interval limits
+//! @param  s_right_count  shared memory storage for number of eigenvalues less
+//!                       than right interval limits
+//! @param  left   lower limit of interval
+//! @param  right  upper limit of interval
+//! @param  left_count  eigenvalues less than \a left
+//! @param  right_count  eigenvalues less than \a right
+//! @param  precision  desired precision for eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<class S, class T, class NumericT>
+__device__
+void
+storeInterval(unsigned int addr,
+              NumericT *s_left, NumericT *s_right,
+              T *s_left_count, T *s_right_count,
+              NumericT left, NumericT right,
+              S left_count, S right_count,
+              NumericT precision)
+{
+    s_left_count[addr] = left_count;
+    s_right_count[addr] = right_count;
+
+    // check if interval converged
+    NumericT t0 = abs(right - left);
+    NumericT t1 = max(abs(left), abs(right)) * precision;
+
+    if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1))
+    {
+        // compute mid point
+        NumericT lambda = computeMidpoint(left, right);
+
+        // mark as converged
+        s_left[addr] = lambda;
+        s_right[addr] = lambda;
+    }
+    else
+    {
+
+        // store current limits
+        s_left[addr] = left;
+        s_right[addr] = right;
+    }
+}
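
The convergence test in storeInterval combines an absolute floor with a relative tolerance, so that tiny intervals around zero can still converge. In isolation, the predicate looks like the sketch below; the floor value is a placeholder standing in for VIENNACL_BISECT_MIN_ABS_INTERVAL from config.hpp and is assumed here rather than taken from the patch.

  #include <algorithm>
  #include <cmath>

  bool interval_converged(double left, double right, double precision)
  {
    const double min_abs_interval = 5.0e-37;  // placeholder; the real constant lives in config.hpp
    double t0 = std::fabs(right - left);
    double t1 = std::max(std::fabs(left), std::fabs(right)) * precision;
    return t0 <= std::max(min_abs_interval, t1);
  }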
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param  g_d  diagonal elements stored in global memory
+//! @param  g_s  superdiagonal elements stored in global memory
+//! @param  n    size of matrix
+//! @param  x    value for which the number of eigenvalues that are smaller is
+//!              sought
+//! @param  tid  thread identifier (e.g. threadIdx.x or gtid)
+//! @param  num_intervals_active  number of active intervals / threads that
+//!                               currently process an interval
+//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  s_s  scratch space to store superdiagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  converged  flag if the current thread is already converged (that
+//!         is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline unsigned int
+computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+                           const NumericT x,
+                           const unsigned int tid,
+                           const unsigned int num_intervals_active,
+                           NumericT *s_d, NumericT *s_s,
+                           unsigned int converged
+                          )
+{
+
+    NumericT  delta = 1.0f;
+    unsigned int count = 0;
+
+    __syncthreads();
+
+    // read data into shared memory
+    if (threadIdx.x < n)
+    {
+        s_d[threadIdx.x] = *(g_d + threadIdx.x);
+        s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
+    }
+
+    __syncthreads();
+
+    // perform loop only for active threads
+    if ((tid < num_intervals_active) && (0 == converged))
+    {
+
+        // perform (optimized) Gaussian elimination to determine the number
+        // of eigenvalues that are smaller than x
+        for (unsigned int k = 0; k < n; ++k)
+        {
+            delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
+            count += (delta < 0) ? 1 : 0;
+        }
+
+    }  // end if thread currently processing an interval
+
+    return count;
+}
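
The loop above is the classical Sturm count: the quantities \delta_k are the pivots of the LDL^T factorization of T - xI, and by Sylvester's law of inertia the number of negative pivots equals the number of eigenvalues of T below x (ignoring the degenerate case of a zero pivot). In LaTeX notation:

  \delta_0 = d_0 - x, \qquad
  \delta_k = d_k - x - \frac{s_{k-1}^2}{\delta_{k-1}} \quad (k \ge 1), \qquad
  \#\{k : \delta_k < 0\} = \#\{\lambda \in \sigma(T) : \lambda < x\}.

As a worked check (illustrative only), take d = (2, 2, 2) and s = (1, 1), whose eigenvalues are 2 - sqrt(2), 2 and 2 + sqrt(2). For x = 2.5 the pivots are \delta_0 = -0.5, \delta_1 = -0.5 - 1/(-0.5) = 1.5 and \delta_2 = -0.5 - 1/1.5 \approx -1.17, i.e. two negative pivots, matching the two eigenvalues below 2.5.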
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param  g_d  diagonal elements stored in global memory
+//! @param  g_s  superdiagonal elements stored in global memory
+//! @param  n    size of matrix
+//! @param  x    value for which the number of eigenvalues that are smaller is
+//!              sought
+//! @param  tid  thread identifier (e.g. threadIdx.x or gtid)
+//! @param  num_intervals_active  number of active intervals / threads that
+//!                               currently process an interval
+//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  s_s  scratch space to store superdiagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  converged  flag if the current thread is already converged (that
+//!         is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline unsigned int
+computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+                                const NumericT x,
+                                const unsigned int tid,
+                                const unsigned int num_intervals_active,
+                                NumericT *s_d, NumericT *s_s,
+                                unsigned int converged
+                               )
+{
+    NumericT  delta = 1.0f;
+    unsigned int count = 0;
+
+    unsigned int rem = n;
+
+    // do until whole diagonal and superdiagonal has been loaded and processed
+    for (unsigned int i = 0; i < n; i += blockDim.x)
+    {
+
+        __syncthreads();
+
+        // read new chunk of data into shared memory
+        if ((i + threadIdx.x) < n)
+        {
+
+            s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
+            s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
+        }
+
+        __syncthreads();
+
+
+        if (tid < num_intervals_active)
+        {
+
+            // perform (optimized) Gaussian elimination to determine the number
+            // of eigenvalues that are smaller than x
+            for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
+            {
+                delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
+                // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
+                count += (delta < 0) ? 1 : 0;
+            }
+
+        }  // end if thread currently processing an interval
+
+        rem -= blockDim.x;
+    }
+
+    return count;
+}
+
+/** @brief Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread.
+*
+* @param  addr                   base address for storing intervals
+* @param  num_threads_active     number of threads / intervals in current sweep
+* @param  s_left                 shared memory storage for left interval limits
+* @param  s_right                shared memory storage for right interval limits
+* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
+* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
+* @param  left                   lower limit of interval
+* @param  mid                    midpoint of interval
+* @param  right                  upper limit of interval
+* @param  left_count             eigenvalues less than \a left
+* @param  mid_count              eigenvalues less than \a mid
+* @param  right_count            eigenvalues less than \a right
+* @param  precision              desired precision for eigenvalues
+* @param  compact_second_chunk   shared mem flag if second chunk is used and ergo requires compaction
+* @param  s_compaction_list_exc  helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
+* @param  is_active_second       mark if the thread has a second non-empty child interval
+*/
+template<class S, class T, class NumericT>
+__device__
+void
+storeNonEmptyIntervals(unsigned int addr,
+                       const unsigned int num_threads_active,
+                       NumericT  *s_left, NumericT *s_right,
+                       T  *s_left_count, T *s_right_count,
+                       NumericT left, NumericT mid, NumericT right,
+                       const S left_count,
+                       const S mid_count,
+                       const S right_count,
+                       NumericT precision,
+                       unsigned int &compact_second_chunk,
+                       T *s_compaction_list_exc,
+                       unsigned int &is_active_second)
+{
+    // check if both child intervals are valid
+
+    if ((left_count != mid_count) && (mid_count != right_count))
+    {
+
+        // store the left interval
+        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                      left, mid, left_count, mid_count, precision);
+
+        // mark that a second interval has been generated, only stored after
+        // stream compaction of second chunk
+        is_active_second = 1;
+        s_compaction_list_exc[threadIdx.x] = 1;
+        compact_second_chunk = 1;
+    }
+    else
+    {
+
+        // only one non-empty child interval
+
+        // mark that no second child
+        is_active_second = 0;
+        s_compaction_list_exc[threadIdx.x] = 0;
+
+        // store the one valid child interval
+        if (left_count != mid_count)
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          left, mid, left_count, mid_count, precision);
+        }
+        else
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          mid, right, mid_count, right_count, precision);
+        }
+
+    }
+}
+////////////////////////////////////////////////////////////////////////////////
+//! Create indices for compaction, that is, process \a s_compaction_list_exc,
+//! which is 1 for intervals that generated a second child and 0 otherwise,
+//! and compute for each of the non-zero elements the index at which the new
+//! interval is stored in a compact representation of all generated second
+//! children
+//! @param   s_compaction_list_exc  list containing the flags marking which threads
+//!                                 generated two children
+//! @param   num_threads_compaction number of threads to employ for compaction
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+__device__
+void
+createIndicesCompaction(T *s_compaction_list_exc,
+                        unsigned int num_threads_compaction)
+{
+
+    unsigned int offset = 1;
+    const unsigned int tid = threadIdx.x;
+   // if(tid == 0)
+     // printf("num_threads_compaction = %u\n", num_threads_compaction);
+
+    // higher levels of scan tree
+    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int  ai = offset*(2*tid+1)-1;
+            unsigned int  bi = offset*(2*tid+2)-1;
+
+            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
+                                          + s_compaction_list_exc[ai];
+        }
+
+        offset <<= 1;
+    }
+
+    // traverse down tree: first down to level 2 across
+    for (int d = 2; d < num_threads_compaction; d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        if (tid < (d-1))
+        {
+
+            unsigned int  ai = offset*(tid+1) - 1;
+            unsigned int  bi = ai + (offset >> 1);
+
+            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
+                                          + s_compaction_list_exc[ai];
+        }
+    }
+
+    __syncthreads();
+
+}
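
What the scan above ultimately provides is, for every thread that produced a second child, the offset at which that child is written behind the first chunk (see compactIntervals below). A serial sketch of the same bookkeeping, with hypothetical names and an explicit example, is:

  #include <cstddef>
  #include <vector>

  // Exclusive prefix sum over the "produced a second child" flags; offsets[i] is the
  // slot (relative to num_threads_active) where thread i stores its second child,
  // and offsets.back() is the total number of second children.
  std::vector<unsigned int> compaction_offsets(std::vector<unsigned int> const & flags)
  {
    std::vector<unsigned int> offsets(flags.size() + 1, 0);
    for (std::size_t i = 0; i < flags.size(); ++i)
      offsets[i + 1] = offsets[i] + flags[i];
    return offsets;
  }

  // Example: flags {1, 0, 1, 1} give offsets {0, 1, 1, 2, 3}; the three flagged
  // threads write their second children to slots num_threads_active + 0, + 1, + 2.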
+
+/** @brief Perform stream compaction for second child intervals
+*
+* @param  s_left              shared memory storage for left interval limits
+* @param  s_right             shared memory storage for right interval limits
+* @param  s_left_count        shared memory storage for number of eigenvalues less than left interval limits
+* @param  s_right_count       shared memory storage for number of eigenvalues less than right interval limits
+* @param  mid                 midpoint of current interval (left of new interval)
+* @param  right               upper limit of interval
+* @param  mid_count           eigenvalues less than mid
+* @param  right_count         eigenvalues less than right
+* @param  s_compaction_list   list containing the indices where the data has to be stored
+* @param  num_threads_active  number of active threads / intervals
+* @param  is_active_second    mark if the thread has a second non-empty child interval
+*/
+template<class T, class NumericT>
+__device__
+void
+compactIntervals(NumericT *s_left, NumericT *s_right,
+                 T *s_left_count, T *s_right_count,
+                 NumericT mid, NumericT right,
+                 unsigned int mid_count, unsigned int right_count,
+                 T *s_compaction_list,
+                 unsigned int num_threads_active,
+                 unsigned int is_active_second)
+{
+    const unsigned int tid = threadIdx.x;
+
+    // perform compaction / copy data for all threads where the second
+    // child is not dead
+    if ((tid < num_threads_active) && (1 == is_active_second))
+    {
+        unsigned int addr_w = num_threads_active + s_compaction_list[tid];
+        s_left[addr_w] = mid;
+        s_right[addr_w] = right;
+        s_left_count[addr_w] = mid_count;
+        s_right_count[addr_w] = right_count;
+    }
+}
+
+template<class T, class S, class NumericT>
+__device__
+void
+storeIntervalConverged(NumericT *s_left, NumericT *s_right,
+                       T *s_left_count, T *s_right_count,
+                       NumericT &left, NumericT &mid, NumericT &right,
+                       S &left_count, S &mid_count, S &right_count,
+                       T *s_compaction_list_exc,
+                       unsigned int &compact_second_chunk,
+                       const unsigned int num_threads_active,
+                       unsigned int &is_active_second)
+{
+    const unsigned int tid = threadIdx.x;
+    const unsigned int multiplicity = right_count - left_count;
+    // check multiplicity of eigenvalue
+    if (1 == multiplicity)
+    {
+
+        // just re-store intervals, simple eigenvalue
+        s_left[tid] = left;
+        s_right[tid] = right;
+        s_left_count[tid] = left_count;
+        s_right_count[tid] = right_count;
+
+
+        // mark that no second child / clear
+        is_active_second = 0;
+        s_compaction_list_exc[tid] = 0;
+    }
+    else
+    {
+
+        // number of eigenvalues after the split less than mid
+        mid_count = left_count + (multiplicity >> 1);
+
+        // store left interval
+        s_left[tid] = left;
+        s_right[tid] = right;
+        s_left_count[tid] = left_count;
+        s_right_count[tid] = mid_count;
+        mid = left;
+
+        // mark that second child interval exists
+        is_active_second = 1;
+        s_compaction_list_exc[tid] = 1;
+        compact_second_chunk = 1;
+    }
+}
+
+/** @brief Subdivide interval if active and not already converged.
+*
+* @param  tid                    id of thread
+* @param  s_left                 shared memory storage for left interval limits
+* @param  s_right                shared memory storage for right interval limits
+* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
+* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
+* @param  num_threads_active     number of active threads in warp
+* @param  left                   lower limit of interval
+* @param  right                  upper limit of interval
+* @param  left_count             eigenvalues less than \a left
+* @param  right_count            eigenvalues less than \a right
+* @param  mid                    midpoint of interval
+* @param  all_threads_converged  shared memory flag indicating whether all threads have converged
+*/
+template<class T, class NumericT>
+__device__
+void
+subdivideActiveIntervalMulti(const unsigned int tid,
+                        NumericT *s_left, NumericT *s_right,
+                        T *s_left_count, T *s_right_count,
+                        const unsigned int num_threads_active,
+                        NumericT &left, NumericT &right,
+                        unsigned int &left_count, unsigned int &right_count,
+                        NumericT &mid, unsigned int &all_threads_converged)
+{
+  // for all active threads
+  if (tid < num_threads_active)
+  {
+
+    left = s_left[tid];
+    right = s_right[tid];
+    left_count = s_left_count[tid];
+    right_count = s_right_count[tid];
+
+    // check if thread already converged
+    if (left != right)
+    {
+
+      mid = computeMidpoint(left, right);
+      all_threads_converged = 0;
+    }
+    else if ((right_count - left_count) > 1)
+    {
+      // mark as not converged if multiple eigenvalues are enclosed;
+      // the interval is duplicated in storeIntervalConverged()
+      all_threads_converged = 0;
+    }
+
+  }  // end for all active threads
+}
+
+
+/** @brief Subdivide interval if active and not already converged.
+*
+* @param  tid                    id of thread
+* @param  s_left                 shared memory storage for left interval limits
+* @param  s_right                shared memory storage for right interval limits
+* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
+* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
+* @param  num_threads_active     number of active threads in warp
+* @param  left                   lower limit of interval
+* @param  right                  upper limit of interval
+* @param  left_count             eigenvalues less than \a left
+* @param  right_count            eigenvalues less than \a right
+* @param  mid                    midpoint of interval
+* @param  all_threads_converged  shared memory flag indicating whether all threads have converged
+*/
+template<class T, class NumericT>
+__device__
+void
+subdivideActiveInterval(const unsigned int tid,
+                        NumericT *s_left, NumericT *s_right,
+                        T *s_left_count, T *s_right_count,
+                        const unsigned int num_threads_active,
+                        NumericT &left, NumericT &right,
+                        unsigned int &left_count, unsigned int &right_count,
+                        NumericT &mid, unsigned int &all_threads_converged)
+{
+  // for all active threads
+  if (tid < num_threads_active)
+  {
+
+    left = s_left[tid];
+    right = s_right[tid];
+    left_count = s_left_count[tid];
+    right_count = s_right_count[tid];
+
+    // check if thread already converged
+    if (left != right)
+    {
+
+      mid = computeMidpoint(left, right);
+      all_threads_converged = 0;
+    }
+  }  // end for all active threads
+}
+}
+}
+}
+
+#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+
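
A note on the compaction scheme used by createIndicesCompaction() and
compactIntervals() above: a prefix sum over the "generated a second child" flags
yields, for every interval that split, a compact destination slot for its second
child, which compactIntervals() then writes to num_threads_active + slot. The
kernels realize the sum with a work-efficient scan in shared memory; the short
host-side sketch below only illustrates the index computation (sequential code,
names invented for this illustration):

    #include <cstddef>
    #include <vector>

    // Exclusive prefix sum over split flags: idx[i] is the number of second
    // children produced by intervals 0..i-1, i.e. the slot of interval i's
    // second child if flags[i] == 1.
    std::vector<unsigned int> compaction_indices(std::vector<unsigned int> const & flags)
    {
      std::vector<unsigned int> idx(flags.size());
      unsigned int running = 0;
      for (std::size_t i = 0; i < flags.size(); ++i)
      {
        idx[i] = running;     // index before adding the own flag
        running += flags[i];
      }
      return idx;
    }

    // Example: flags {0,1,0,1,1} -> idx {0,0,1,1,2}; the three second children are
    // then stored contiguously right behind the num_threads_active active intervals.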

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
new file mode 100644
index 0000000..3622b89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_
+#define VIENNACL_LINALG_CUDA_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/common.hpp
+    @brief Common routines for CUDA execution
+*/
+
+#include <sstream>
+#include <cuda_runtime.h>
+#include "viennacl/backend/cuda.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)  detail::cuda_last_error_check (message, __FILE__, __LINE__)
+
+namespace viennacl
+{
+
+////// scalar
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(scalar<NumericT> & obj)
+{
+  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(scalar<NumericT> const & obj)
+{
+  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+////// vector_base
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(vector_base<NumericT> & obj)
+{
+  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(vector_base<NumericT> const & obj)
+{
+  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Non-const version. */
+template<typename ReturnT, typename NumericT>
+ReturnT * cuda_arg(vector_base<NumericT> & obj)
+{
+  return reinterpret_cast<ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Const version. */
+template<typename ReturnT, typename NumericT>
+const ReturnT * cuda_arg(vector_base<NumericT> const & obj)
+{
+  return reinterpret_cast<const ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+////// matrix_base
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(matrix_base<NumericT> & obj)
+{
+  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(matrix_base<NumericT> const & obj)
+{
+  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+
+////// mem_handle
+
+
+/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Non-const version. */
+template<typename ReturnT>
+ReturnT * cuda_arg(viennacl::backend::mem_handle & h)
+{
+  return reinterpret_cast<ReturnT *>(h.cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. */
+template<typename ReturnT>
+ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h)
+{
+  return reinterpret_cast<const ReturnT *>(h.cuda_handle().get());
+}
+
+/** \cond **/
+template<typename ReturnT>
+ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
+{
+  return reinterpret_cast<ReturnT *>(h.get());
+}
+
+template<typename ReturnT>
+ReturnT const *  cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h)
+{
+  return reinterpret_cast<const ReturnT *>(h.get());
+}
+
+inline unsigned int cuda_arg(unsigned int val)  { return val; }
+
+template<typename NumericT> char           cuda_arg(char val)           { return val; }
+template<typename NumericT> unsigned char  cuda_arg(unsigned char val)  { return val; }
+
+template<typename NumericT> short          cuda_arg(short val)          { return val; }
+template<typename NumericT> unsigned short cuda_arg(unsigned short val) { return val; }
+
+template<typename NumericT> int            cuda_arg(int val)            { return val; }
+template<typename NumericT> unsigned int   cuda_arg(unsigned int val)   { return val; }
+
+template<typename NumericT> long           cuda_arg(long val)           { return val; }
+template<typename NumericT> unsigned long  cuda_arg(unsigned long val)  { return val; }
+
+template<typename NumericT> float          cuda_arg(float val)          { return val; }
+template<typename NumericT> double         cuda_arg(double val)         { return val; }
+
+/** \endcond */
+
+
+namespace linalg
+{
+namespace cuda
+{
+
+
+namespace detail
+{
+
+// Packs per-kernel options into one unsigned int: bit 0 = flip_sign, bit 1 = reciprocal,
+// bits 2 and above carry the length (only stored if length > 1).
+inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+{
+  return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
+
+inline void cuda_last_error_check(const char * message, const char * file, const int line )
+{
+  cudaError_t error_code = cudaGetLastError();
+
+  if (cudaSuccess != error_code)
+  {
+    std::stringstream ss;
+    ss << file << "(" << line << "): getLastCudaError() CUDA error " << error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message << std::endl;
+    throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+  }
+}
+
+template<typename NumericT>
+struct type_to_type2;
+
+template<>
+struct type_to_type2<float> { typedef float2  type; };
+
+template<>
+struct type_to_type2<double> { typedef double2  type; };
+
+
+template<typename NumericT, typename OtherT>
+typename viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar<NumericT> & s, OtherT) { return s.handle().cuda_handle(); }
+
+template<typename NumericT, typename OtherT>
+typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return s.handle().cuda_handle(); }
+
+// all other cases where T is not a ViennaCL scalar
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              char const &>::type
+arg_reference(ArgT, char const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              unsigned char const &>::type
+arg_reference(ArgT, unsigned char const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              short const &>::type
+arg_reference(ArgT, short const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              unsigned short const &>::type
+arg_reference(ArgT, unsigned short const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              int const &>::type
+arg_reference(ArgT, int const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              unsigned int const &>::type
+arg_reference(ArgT, unsigned int const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              long const &>::type
+arg_reference(ArgT, long const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              unsigned long const &>::type
+arg_reference(ArgT, unsigned long const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              float const &>::type
+arg_reference(ArgT, float const & val)  { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+                              double const &>::type
+arg_reference(ArgT, double const & val)  { return val; }
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
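
The helpers above are typically combined in the backend as in the following sketch
(scale_by_two and its kernel are invented for this illustration; only cuda_arg, the
traits functions and the error-check macro come from the headers themselves):

    template<typename NumericT>
    __global__ void scale_by_two_kernel(NumericT * x,
                                        unsigned int start, unsigned int inc, unsigned int size)
    {
      // grid-stride loop over the logical elements of the strided vector
      for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
        x[i * inc + start] *= NumericT(2);
    }

    template<typename NumericT>
    void scale_by_two(viennacl::vector_base<NumericT> & x)
    {
      scale_by_two_kernel<<<128, 128>>>(viennacl::cuda_arg(x),   // raw device pointer
                                        static_cast<unsigned int>(viennacl::traits::start(x)),
                                        static_cast<unsigned int>(viennacl::traits::stride(x)),
                                        static_cast<unsigned int>(viennacl::traits::size(x)));
      VIENNACL_CUDA_LAST_ERROR_CHECK("scale_by_two_kernel");     // throws cuda_exception on failure
    }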

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
new file mode 100644
index 0000000..ae70f9a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
@@ -0,0 +1,412 @@
+#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/direct_solve.hpp
+    @brief Implementations of dense direct solvers using CUDA are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename NumericT>
+__global__ void matrix_matrix_upper_solve_kernel(
+          const NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1, unsigned int A_internal_size2,
+          bool row_major_A,
+
+          NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_size1,  unsigned int B_size2,
+          unsigned int B_internal_size1, unsigned int B_internal_size2,
+          bool row_major_B,
+
+          bool unit_diagonal)
+{
+  NumericT temp;
+  NumericT entry_A;
+
+  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
+  {
+    unsigned int row = A_size1 - 1 - row_cnt;
+
+    if (!unit_diagonal)
+    {
+      __syncthreads();
+
+      if (threadIdx.x == 0)
+      {
+        if (row_major_B)
+          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+        else //if (!row_major_B)
+          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+      }
+    }
+
+    __syncthreads();
+
+    if (row_major_B)
+      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
+    else //if (!row_major_B)
+      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
+
+    // eliminate column of op(A) with index 'row' in parallel
+    for  (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
+    {
+      if (row_major_A)
+        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+      else //if (!row_major_A)
+        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+
+      if (row_major_B)
+        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
+      else //if (!row_major_B)
+        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
+
+    }
+  }
+}
+
+
+
+template<typename NumericT>
+__global__ void matrix_matrix_lower_solve_kernel(
+          const NumericT * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1, unsigned int A_internal_size2,
+          bool row_major_A,
+
+          NumericT * B,
+          unsigned int B_start1, unsigned int B_start2,
+          unsigned int B_inc1,   unsigned int B_inc2,
+          unsigned int B_size1,  unsigned int B_size2,
+          unsigned int B_internal_size1, unsigned int B_internal_size2,
+          bool row_major_B,
+
+          bool unit_diagonal)
+{
+  NumericT temp;
+  NumericT entry_A;
+
+  for (unsigned int row = 0; row < A_size1; ++row)
+  {
+
+    if (!unit_diagonal)
+    {
+      __syncthreads();
+
+      if (threadIdx.x == 0)
+      {
+        if (row_major_B)
+          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+        else //if (!row_major_B)
+          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+      }
+    }
+
+    __syncthreads();
+
+    if (row_major_B)
+      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
+    else //if (!row_major_B)
+      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
+
+    // eliminate column of op(A) with index 'row' in parallel
+    for  (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
+    {
+      if (row_major_A)
+        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+      else //if (!row_major_A)
+        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+
+      if (row_major_B)
+        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
+      else //if (!row_major_B)
+        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
+
+    }
+  }
+}
+
+
+
+
+
+
+namespace detail
+{
+  template<typename TagT>
+  bool is_unit_solve(TagT const & tag) { return false; }
+
+  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
+  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }
+
+  template<typename TagT>
+  bool is_upper_solve(TagT const & tag) { return false; }
+
+  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
+  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }
+
+  template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
+  void inplace_solve_impl(Matrix1T const & A,
+                          Matrix2T & B,
+                          SolverTagT const & tag)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type        value_type;
+
+    dim3 threads(128);
+    dim3 grid(B.size2());
+
+    if (is_upper_solve(tag))
+    {
+      matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+                                                         bool(A.row_major()),
+
+                                                         viennacl::cuda_arg(B),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+                                                         bool(B.row_major()),
+
+                                                         is_unit_solve(tag)
+                                                        );
+    }
+    else
+    {
+      matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+                                                         bool(A.row_major()),
+
+                                                         viennacl::cuda_arg(B),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+                                                         bool(B.row_major()),
+
+                                                         is_unit_solve(tag)
+                                                        );
+    }
+
+  }
+}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// triangular solver //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation).
+*
+* @param A         The system matrix
+* @param B         The matrix of row vectors, where the solution is directly written to
+* @param tag       Solver tag for identifying the respective triangular solver
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT tag)
+{
+  detail::inplace_solve_impl(A, B, tag);
+}
+
+
+//
+//  Solve on vector
+//
+
+template<typename NumericT>
+__global__ void triangular_substitute_inplace_row_kernel(
+          NumericT const * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+
+          unsigned int options)
+{
+  NumericT temp;
+  unsigned int unit_diagonal_flag  = (options & (1 << 0));
+
+  unsigned int is_lower_solve      = (options & (1 << 2));
+  unsigned int row;
+  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
+  {
+    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+    if (!unit_diagonal_flag)
+    {
+      __syncthreads();
+      if (threadIdx.x == 0)
+        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+    }
+
+    __syncthreads();
+
+    temp = v[row * v_inc + v_start];
+
+    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+            elim < (is_lower_solve ? A_size1 : row);
+            elim += blockDim.x)
+      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2)];
+  }
+}
+
+
+template<typename NumericT>
+__global__ void triangular_substitute_inplace_col_kernel(
+          NumericT const * A,
+          unsigned int A_start1, unsigned int A_start2,
+          unsigned int A_inc1,   unsigned int A_inc2,
+          unsigned int A_size1,  unsigned int A_size2,
+          unsigned int A_internal_size1,  unsigned int A_internal_size2,
+          NumericT * v,
+          unsigned int v_start,
+          unsigned int v_inc,
+          unsigned int v_size,
+          unsigned int options)
+{
+  NumericT temp;
+  unsigned int unit_diagonal_flag  = (options & (1 << 0));
+
+  unsigned int is_lower_solve      = (options & (1 << 2));
+  unsigned int row;
+  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
+  {
+    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+    if (!unit_diagonal_flag)
+    {
+      __syncthreads();
+      if (threadIdx.x == 0)
+        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+    }
+
+    __syncthreads();
+
+    temp = v[row * v_inc + v_start];
+
+    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+            elim < (is_lower_solve ? A_size1 : row);
+            elim += blockDim.x)
+      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1];
+  }
+}
+
+
+namespace detail
+{
+  // options bit layout read by the substitution kernels above:
+  // bit 0 = unit diagonal, bit 2 = lower-triangular solve
+  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
+  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
+  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector_impl(MatrixT const & mat,
+                                 VectorT & vec,
+                                 unsigned int options)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type        value_type;
+
+    if (mat.row_major())
+    {
+      triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
+                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                           viennacl::cuda_arg(vec),
+                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
+                                                           options
+                                                          );
+    }
+    else
+    {
+      triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
+                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                           viennacl::cuda_arg(vec),
+                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
+                                                           options
+                                                          );
+    }
+  }
+
+}
+
+/** @brief Direct inplace solver for dense triangular systems (non-transposed version)
+*
+* @param mat       The system matrix
+* @param vec       The right-hand side vector, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & mat,
+                   vector_base<NumericT> & vec,
+                   SolverTagT)
+{
+  unsigned int options = detail::get_option_for_solver_tag(SolverTagT());
+
+  detail::inplace_solve_vector_impl(mat, vec, options);
+}
+
+
+}
+}
+}
+
+#endif
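
For reference, these backend routines are reached through ViennaCL's generic
frontend rather than called directly; a minimal usage sketch (assuming a
CUDA-enabled build, i.e. VIENNACL_WITH_CUDA, and the public header
viennacl/linalg/direct_solve.hpp providing viennacl::linalg::inplace_solve):

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    void triangular_solve_example()
    {
      viennacl::matrix<float> A(64, 64);   // upper-triangular system matrix, filled elsewhere
      viennacl::vector<float> b(64);       // right-hand side, overwritten with the solution
      // in-place triangular solve; on the CUDA backend this should end up in
      // triangular_substitute_inplace_row_kernel / ..._col_kernel above
      viennacl::linalg::inplace_solve(A, b, viennacl::linalg::upper_tag());
    }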


http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
new file mode 100644
index 0000000..198ac31
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
@@ -0,0 +1,858 @@
+#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/fft_operations.hpp
+    @brief Implementations of the Fast Fourier Transform using CUDA
+*/
+#include <cmath>
+#include <viennacl/matrix.hpp>
+#include <viennacl/vector.hpp>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+  namespace fft
+  {
+    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+    inline vcl_size_t num_bits(vcl_size_t size)
+    {
+      vcl_size_t bits_datasize = 0;
+      vcl_size_t ds = 1;
+
+      while (ds < size)
+      {
+        ds = ds << 1;
+        bits_datasize++;
+      }
+
+      return bits_datasize;
+    }
+
+    inline vcl_size_t next_power_2(vcl_size_t n)
+    {
+      n = n - 1;
+
+      vcl_size_t power = 1;
+
+      while (power < sizeof(vcl_size_t) * 8)
+      {
+        n = n | (n >> power);
+        power *= 2;
+      }
+
+      return n + 1;
+    }
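+
+    // Worked examples (illustrative): num_bits(512) == 9, since 2^9 == 512 is the
+    // first power of two >= 512; next_power_2(300) == 512, because 299 is smeared
+    // to 511 by the shifts above and then incremented. Powers of two map to
+    // themselves, e.g. next_power_2(512) == 512.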
+
+  } //namespace fft
+} //namespace detail
+
+// addition
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+  return make_float2(a.x + b.x, a.y + b.y);
+}
+
+// subtract
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+  return make_float2(a.x - b.x, a.y - b.y);
+}
+// division
+template<typename SCALARTYPE>
+inline __device__ float2 operator/(float2 a,SCALARTYPE b)
+{
+  return make_float2(a.x/b, a.y/b);
+}
+
+// multiplication
+inline __device__ float2 operator*(float2 in1, float2 in2)
+{
+  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+// addition
+inline __host__ __device__ double2 operator+(double2 a, double2 b)
+{
+  return make_double2(a.x + b.x, a.y + b.y);
+}
+
+// subtraction
+inline __host__ __device__ double2 operator-(double2 a, double2 b)
+{
+  return make_double2(a.x - b.x, a.y - b.y);
+}
+
+// division
+template<typename SCALARTYPE>
+inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
+{
+  return make_double2(a.x/b, a.y/b);
+}
+
+// multiplication
+inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
+{
+  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+// Reverses the lowest 'bit_size' bits of v (bit-reversal permutation index).
+// Example: v = 3 (0011b) with bit_size = 4 yields 12 (1100b).
+inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
+{
+  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);   // swap adjacent bits
+  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);   // swap bit pairs
+  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);   // swap nibbles
+  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);   // swap bytes
+  v = (v >> 16) | (v << 16);                               // swap half-words
+  v = v >> (32 - bit_size);                                // keep the reversed low bits
+  return v;
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_direct(
+    const Numeric2T * input,
+    Numeric2T * output,
+    unsigned int size,
+    unsigned int stride,
+    unsigned int batch_num,
+    NumericT sign,
+    bool is_row_major)
+{
+
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
+    {
+      Numeric2T f;
+      f.x = 0;
+      f.y = 0;
+
+      for (unsigned int n = 0; n < size; n++)
+      {
+        Numeric2T in;
+        if (!is_row_major)
+          in = input[batch_id * stride + n];   //input index here
+        else
+          in = input[n * stride + batch_id];//input index here
+
+        NumericT sn,cs;
+        NumericT arg = sign * 2 * NUM_PI * k / size * n;
+        sn = sin(arg);
+        cs = cos(arg);
+
+        Numeric2T ex;
+        ex.x = cs;
+        ex.y = sn;
+        Numeric2T tmp;
+        tmp.x = in.x * ex.x - in.y * ex.y;
+        tmp.y = in.x * ex.y + in.y * ex.x;
+        f = f + tmp;
+      }
+
+      if (!is_row_major)
+        output[batch_id * stride + k] = f; // output index here
+      else
+        output[k * stride + batch_id] = f;// output index here
+    }
+  }
+}
+
+/**
+ * @brief Direct 1D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+            NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
+                          static_cast<unsigned int>(size),
+                          static_cast<unsigned int>(stride),
+                          static_cast<unsigned int>(batch_num),
+                          sign,
+                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
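+
+// Usage sketch for the 1D overload above (illustrative only): complex values are
+// stored as interleaved (real, imag) pairs, so a length-N transform operates on a
+// vector of 2*N scalars, e.g.
+//
+//   viennacl::vector<float> in(2 * 64), out(2 * 64);    // 64 complex samples
+//   viennacl::linalg::cuda::direct(in, out, 64, 64, 1); // one batch, default sign -1
+//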
+
+/**
+ * @brief Direct 2D algorithm for computing the Fourier transform.
+ *
+ * Works for arbitrary data sizes.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+            NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
+                          static_cast<unsigned int>(size),
+                          static_cast<unsigned int>(stride),
+                          static_cast<unsigned int>(batch_num),
+                          sign,
+                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
+
+template<typename NumericT>
+__global__ void fft_reorder(NumericT * input,
+                            unsigned int bit_size,
+                            unsigned int size,
+                            unsigned int stride,
+                            unsigned int batch_num,
+                            bool is_row_major)
+{
+
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int i = glb_id; i < size; i += glb_sz)
+    {
+      unsigned int v = get_reorder_num(i, bit_size);
+
+      if (i < v)
+      {
+        if (!is_row_major)
+        {
+          NumericT tmp = input[batch_id * stride + i];    // index
+          input[batch_id * stride + i] = input[batch_id * stride + v];//index
+          input[batch_id * stride + v] = tmp;//index
+        }
+        else
+        {
+          NumericT tmp = input[i * stride + batch_id];
+          input[i * stride + batch_id] = input[v * stride + batch_id];
+          input[v * stride + batch_id] = tmp;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * This function reorders the input data: indices are permuted into bit-reversal order.
+ * Such a reordering has to be done before an in-place radix-2 FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV> & in,
+             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                           static_cast<unsigned int>(bits_datasize),
+                           static_cast<unsigned int>(size),
+                           static_cast<unsigned int>(stride),
+                           static_cast<unsigned int>(batch_num),
+                           static_cast<bool>(data_order));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2_local(Numeric2T * input,
+                                 unsigned int bit_size,
+                                 unsigned int size,
+                                 unsigned int stride,
+                                 unsigned int batch_num,
+                                 NumericT sign,
+                                 bool is_row_major)
+{
+  __shared__ Numeric2T lcl_input[1024];
+  unsigned int grp_id = blockIdx.x;
+  unsigned int grp_num = gridDim.x;
+
+  unsigned int lcl_sz = blockDim.x;
+  unsigned int lcl_id = threadIdx.x;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
+  {
+    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+    {
+      unsigned int v = get_reorder_num(p, bit_size);
+      if (!is_row_major)
+        lcl_input[v] = input[batch_id * stride + p];
+      else
+        lcl_input[v] = input[p * stride + batch_id];
+    }
+
+    __syncthreads();
+
+    // performs Cooley-Tukey FFT on the local array
+    for (unsigned int s = 0; s < bit_size; s++)
+    {
+      unsigned int ss = 1 << s;
+      NumericT cs, sn;
+      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
+      {
+        unsigned int group = (tid & (ss - 1));
+        unsigned int pos = ((tid >> s) << (s + 1)) + group;
+
+        Numeric2T in1 = lcl_input[pos];
+        Numeric2T in2 = lcl_input[pos + ss];
+
+        NumericT arg = group * sign * NUM_PI / ss;
+
+        sn = sin(arg);
+        cs = cos(arg);
+        Numeric2T ex;
+        ex.x = cs;
+        ex.y = sn;
+
+        Numeric2T tmp;
+        tmp.x = in2.x * ex.x - in2.y * ex.y;
+        tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+        lcl_input[pos + ss] = in1 - tmp;
+        lcl_input[pos]      = in1 + tmp;
+      }
+      __syncthreads();
+    }
+
+    //copy local array back to global memory
+    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+    {
+      if (!is_row_major)
+        input[batch_id * stride + p] = lcl_input[p];   //index
+      else
+        input[p * stride + batch_id] = lcl_input[p];
+    }
+
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2(Numeric2T * input,
+                           unsigned int s,
+                           unsigned int bit_size,
+                           unsigned int size,
+                           unsigned int stride,
+                           unsigned int batch_num,
+                           NumericT sign,
+                           bool is_row_major)
+{
+
+  unsigned int ss = 1 << s;
+  unsigned int half_size = size >> 1;
+
+  NumericT cs, sn;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
+    {
+      unsigned int group = (tid & (ss - 1));
+      unsigned int pos = ((tid >> s) << (s + 1)) + group;
+      Numeric2T in1;
+      Numeric2T in2;
+      unsigned int offset;
+      if (!is_row_major)
+      {
+        offset = batch_id * stride + pos;
+        in1 = input[offset];   //index
+        in2 = input[offset + ss];//index
+      }
+      else
+      {
+        offset = pos * stride + batch_id;
+        in1 = input[offset];   //index
+        in2 = input[offset + ss * stride];//index
+      }
+
+      NumericT arg = group * sign * NUM_PI / ss;
+
+      sn = sin(arg);
+      cs = cos(arg);
+
+      Numeric2T ex;
+      ex.x = cs;
+      ex.y = sn;
+
+      Numeric2T tmp;
+      tmp.x = in2.x * ex.x - in2.y * ex.y;
+      tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+      if (!is_row_major)
+        input[offset + ss] = in1 - tmp;  //index
+      else
+        input[offset + ss * stride] = in1 - tmp;  //index
+      input[offset] = in1 + tmp;  //index
+    }
+  }
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV> & in,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                                  static_cast<unsigned int>(bit_size),
+                                  static_cast<unsigned int>(size),
+                                  static_cast<unsigned int>(stride),
+                                  static_cast<unsigned int>(batch_num),
+                                  static_cast<NumericT>(sign),
+                                  static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+  }
+  else
+  {
+    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             static_cast<unsigned int>(bit_size),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(stride),
+                             static_cast<unsigned int>(batch_num),
+                             static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+
+    for (vcl_size_t step = 0; step < bit_size; step++)
+    {
+      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                              static_cast<unsigned int>(step),
+                              static_cast<unsigned int>(bit_size),
+                              static_cast<unsigned int>(size),
+                              static_cast<unsigned int>(stride),
+                              static_cast<unsigned int>(batch_num),
+                              sign,
+                              static_cast<bool>(data_order));
+      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+    }
+  }
+}
+
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                                  static_cast<unsigned int>(bit_size),
+                                  static_cast<unsigned int>(size),
+                                  static_cast<unsigned int>(stride),
+                                  static_cast<unsigned int>(batch_num),
+                                  sign,
+                                  static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+  }
+  else
+  {
+    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             static_cast<unsigned int>(bit_size),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(stride),
+                             static_cast<unsigned int>(batch_num),
+                             static_cast<bool>(data_order));
+    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+    for (vcl_size_t step = 0; step < bit_size; step++)
+    {
+      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                              static_cast<unsigned int>(step),
+                              static_cast<unsigned int>(bit_size),
+                              static_cast<unsigned int>(size),
+                              static_cast<unsigned int>(stride),
+                              static_cast<unsigned int>(batch_num),
+                              sign,
+                              static_cast<bool>(data_order));
+      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+    }
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
+{
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  unsigned int double_size = size << 1;
+  NumericT sn_a, cs_a;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int i = glb_id; i < size; i += glb_sz)
+  {
+    unsigned int rm = i * i % (double_size);
+    NumericT angle = (NumericT)rm / size * (-NUM_PI);
+
+    sn_a = sin(angle);
+    cs_a= cos(angle);
+
+    Numeric2T b_i;
+    b_i.x = cs_a;
+    b_i.y = sn_a;
+    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
+    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
+  }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
+                              unsigned int size, unsigned int ext_size, NumericT sign)
+{
+  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+  unsigned int double_size = size << 1;
+
+  NumericT sn_a, cs_a;
+  const NumericT NUM_PI(3.14159265358979323846);
+
+  for (unsigned int i = glb_id; i < size; i += glb_sz)
+  {
+    unsigned int rm = i * i % (double_size);
+    NumericT angle = (NumericT)rm / size * NUM_PI;
+
+    sn_a = sin(-angle);
+    cs_a= cos(-angle);
+
+    Numeric2T a_i;
+    a_i.x = cs_a;
+    a_i.y = sn_a;
+
+    Numeric2T b_i;
+    b_i.x = cs_a;
+    b_i.y = -sn_a;
+
+    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
+    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
+    B[i] = b_i;
+
+    // mirror b_i into the tail of B (conditional scattered write; known performance weak spot, to be fixed)
+    if (i)
+      B[ext_size - i] = b_i;
+  }
+}
+
+template<typename NumericT>
+__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    input1[i].x = 0;
+    input1[i].y = 0;
+
+    input2[i].x = 0;
+    input2[i].y = 0;
+  }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses a considerable amount of additional memory, but should be fast for any input size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = in.size() >> 1;
+  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);
+
+  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+                     static_cast<unsigned int>(ext_size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
+
+  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+                             static_cast<unsigned int>(size),
+                             static_cast<unsigned int>(ext_size),
+                             NumericT(1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
+
+  viennacl::linalg::convolve_i(A, B, Z);
+
+  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
+                              reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+                              static_cast<unsigned int>(size),
+                              NumericT(1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
+}
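+
+// Usage sketch (an illustration of the assumed calling convention, not an
+// addition to the API): complex data is stored interleaved, so a length-n
+// signal occupies 2*n scalars; n need not be a power of two.
+//
+//   viennacl::vector<float> in(2 * n), out(2 * n);
+//   // ... fill 'in' with interleaved (re, im) pairs ...
+//   viennacl::linalg::cuda::bluestein(in, out, 1);   // batch argument is currently ignored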
+
+template<typename NumericT>
+__global__ void fft_mult_vec(const NumericT * input1,
+                             const NumericT * input2,
+                             NumericT * output,
+                             unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    NumericT in1 = input1[i];
+    NumericT in2 = input2[i];
+    output[i] = in1 * in2;
+  }
+}
+
+/**
+ * @brief Multiply two complex vectors element-wise and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV> & output)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = input1.size() / 2;
+
+  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
+                            reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
+                            reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
+                            static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
+}
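+
+// Usage sketch (illustrative only; all three vectors use interleaved (re, im)
+// storage, i.e. 2*n scalars for n complex entries):
+//
+//   viennacl::vector<float> a(2 * n), b(2 * n), c(2 * n);
+//   // ... fill a and b ...
+//   viennacl::linalg::cuda::multiply_complex(a, b, c);   // c[k] = a[k] * b[k] (complex product)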
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
+    input1[i] = input1[i]/factor;
+}
+
+/**
+ * @brief Normalize a vector by its own size, i.e. divide every complex entry by the number of complex elements
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  vcl_size_t size = input.size() >> 1;
+  NumericT norm_factor = static_cast<NumericT>(size);
+  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+                                  static_cast<unsigned int>(size),
+                                  norm_factor);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
+}
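+
+// Sketch of the intended use (an assumption based on the 1/N factor above):
+// typically called after an inverse transform to undo the accumulated scaling,
+// e.g.
+//
+//   viennacl::linalg::cuda::normalize(v);   // v holds 2*N scalars; every entry is divided by N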
+
+template<typename NumericT>
+__global__ void transpose(const NumericT * input,
+                          NumericT * output,
+                          unsigned int row_num,
+                          unsigned int col_num)
+{
+  unsigned int size = row_num * col_num;
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    unsigned int row = i / col_num;
+    unsigned int col = i - row*col_num;
+    unsigned int new_pos = col * row_num + row;
+    output[new_pos] = input[i];
+  }
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
+                         reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
+                         static_cast<unsigned int>(input.internal_size1()>>1),
+                         static_cast<unsigned int>(input.internal_size2()>>1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
+
+}
+
+template<typename NumericT>
+__global__ void transpose_inplace(
+    NumericT * input,
+    unsigned int row_num,
+    unsigned int col_num)
+{
+  unsigned int size = row_num * col_num;
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    unsigned int row = i / col_num;
+    unsigned int col = i - row*col_num;
+    unsigned int new_pos = col * row_num + row;
+    if (i < new_pos)
+    {
+      NumericT val = input[i];
+      input[i] = input[new_pos];
+      input[new_pos] = val;
+    }
+  }
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+                                 static_cast<unsigned int>(input.internal_size1()>>1),
+                                 static_cast<unsigned int>(input.internal_size2() >> 1));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
+
+}
+
+template<typename RealT,typename ComplexT>
+__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+  {
+    ComplexT val;
+    val.x = in[i];
+    val.y = 0;
+    out[i] = val;
+  }
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
+                               reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+                               static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
+}
+
+template<typename ComplexT,typename RealT>
+__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+    out[i] = in[i].x;
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>& out, vcl_size_t size)
+{
+  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
+
+  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+                               viennacl::cuda_arg(out),
+                               static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
+
+}
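+
+// Round-trip sketch (illustrative, assuming a real signal of length n):
+// pack the real data into interleaved complex form, transform it, and unpack
+// the real parts again.
+//
+//   viennacl::vector<float> real_in(n), complex_buf(2 * n), real_out(n);
+//   viennacl::linalg::cuda::real_to_complex(real_in, complex_buf, n);
+//   // ... apply an FFT routine from this file to complex_buf ...
+//   viennacl::linalg::cuda::complex_to_real(complex_buf, real_out, n);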
+
+template<typename NumericT>
+__global__ void reverse_inplace(NumericT * vec, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
+  {
+    NumericT val1 = vec[i];
+    NumericT val2 = vec[size - i - 1];
+    vec[i] = val2;
+    vec[size - i - 1] = val1;
+  }
+}
+
+/**
+ * @brief Reverse a vector in place, i.e. overwrite the input vector with its elements in opposite order
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+  vcl_size_t size = in.size();
+  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
+}
+
+}  //namespace cuda
+}  //namespace linalg
+}  //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
new file mode 100644
index 0000000..302a73c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
@@ -0,0 +1,666 @@
+#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_L_kernel_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          unsigned int A_size1,
+          unsigned int * L_row_indices)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int num_entries_L = 0;
+    for (unsigned int j=row_begin; j<row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (col <= row)
+        ++num_entries_L;
+    }
+
+    L_row_indices[row] = num_entries_L;
+  }
+}
+
+template<typename NumericT>
+__global__ void extract_L_kernel_2(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          unsigned int const *L_row_indices,
+          unsigned int       *L_col_indices,
+          NumericT           *L_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int index_L = L_row_indices[row];
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      NumericT value = A_elements[j];
+
+      if (col <= row)
+      {
+        L_col_indices[index_L] = col;
+        L_elements[index_L]    = value;
+        ++index_L;
+      }
+    }
+  }
+}
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  //
+  // Step 1: Count elements in L and U:
+  //
+  extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle1())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  //
+  // Step 3: Write entries
+  //
+  extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");
+
+  L.generate_row_block_information();
+
+} // extract_L
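+
+// Worked example of the count / scan / fill pattern above (numbers are purely
+// illustrative): suppose extract_L_kernel_1 writes the per-row counts
+//
+//   L_row_indices = [2, 1, 3, *]      // one count per row, last slot unused before the scan
+//
+// The exclusive scan then turns the counts into CSR row offsets,
+//
+//   L_row_indices = [0, 2, 3, 6]
+//
+// so row i owns the index range [L_row_indices[i], L_row_indices[i+1]), the last
+// entry (6) is read back for L.reserve(), and extract_L_kernel_2 fills in the
+// column indices and values at those offsets.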
+
+///////////////////////////////////////////////
+
+
+template<typename NumericT>
+__global__ void ilu_scale_kernel_1(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+}
+
+/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
+template<typename NumericT>
+__global__ void ilu_scale_kernel_2(
+          unsigned int const *R_row_indices,
+          unsigned int const *R_col_indices,
+          NumericT           *R_elements,
+          unsigned int R_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < R_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = R_row_indices[row];
+    unsigned int row_end   = R_row_indices[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = row_begin; j < row_end; ++j)
+      R_elements[j] *= D_row * D_elements[R_col_indices[j]];
+  }
+}
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the extracted values in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  // fill D:
+  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg(D)
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+  // scale L:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle()),
+                                   static_cast<unsigned int>(L.size1()),
+                                   viennacl::cuda_arg(D)
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+}
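+
+// Worked check of the scaling above (follows directly from the two kernels):
+// ilu_scale_kernel_1 sets D_ii = 1 / sqrt(|a_ii|), and ilu_scale_kernel_2 maps
+// each entry to a_ij * D_ii * D_jj = a_ij / sqrt(|a_ii| * |a_jj|); in particular
+// the scaled diagonal becomes a_ii / |a_ii| = +/-1, i.e. unit magnitude.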
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ICC sweep */
+template<typename NumericT>
+__global__ void icc_chow_patel_sweep_kernel(
+          unsigned int const *L_row_indices,
+          unsigned int const *L_col_indices,
+          NumericT           *L_elements,
+          NumericT     const *L_backup,
+          unsigned int L_size1,
+          NumericT     const *aij_L)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < L_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_Li_start = L_row_indices[row];
+    unsigned int row_Li_end   = L_row_indices[row + 1];
+
+    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+    {
+      unsigned int col = L_col_indices[i];
+
+      unsigned int row_Lj_start = L_row_indices[col];
+      unsigned int row_Lj_end   = L_row_indices[col + 1];
+
+      // compute s = a_ij - \sum_{k<j} l_ik l_jk (sum over columns present in both rows)
+      unsigned int index_Lj = row_Lj_start;
+      unsigned int col_Lj = L_col_indices[index_Lj];
+      NumericT s = aij_L[i];
+      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+      {
+        unsigned int col_Li = L_col_indices[index_Li];
+
+        // find element in U
+        while (col_Lj < col_Li)
+        {
+          ++index_Lj;
+          col_Lj = L_col_indices[index_Lj];
+        }
+
+        if (col_Lj == col_Li)
+          s -= L_backup[index_Li] * L_backup[index_Lj];
+      }
+
+      // update l_ij:
+      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]);  // the diagonal of L is the last entry in row 'col'
+    }
+
+  }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel-ICC using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L)
+{
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                            viennacl::cuda_arg<NumericT>(L.handle()),
+                                            viennacl::cuda_arg<NumericT>(L_backup),
+                                            static_cast<unsigned int>(L.size1()),
+
+                                            viennacl::cuda_arg<NumericT>(aij_L.handle())
+                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");
+
+}
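+
+// Iteration sketch (illustrative; the sweep count and the setup of aij_L are
+// choices of the calling preconditioner, not of this routine). aij_L is assumed
+// to hold the entries of the scaled A restricted to L's sparsity pattern:
+//
+//   for (unsigned int sweep = 0; sweep < 3; ++sweep)
+//     viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L);
+//
+// Each call snapshots L into L_backup, so within one sweep all reads use the
+// previous iterate (a Jacobi-style fixed-point update).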
+
+
+////////////////////////////// ILU ///////////////////////////
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_LU_kernel_1(
+          const IndexT * A_row_indices,
+          const IndexT * A_col_indices,
+          unsigned int A_size1,
+
+          unsigned int * L_row_indices,
+
+          unsigned int * U_row_indices)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int num_entries_L = 0;
+    unsigned int num_entries_U = 0;
+    for (unsigned int j=row_begin; j<row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      if (col <= row)
+        ++num_entries_L;
+      if (col >= row)
+        ++num_entries_U;
+    }
+
+    L_row_indices[row] = num_entries_L;
+    U_row_indices[row] = num_entries_U;
+  }
+}
+
+template<typename NumericT>
+__global__ void extract_LU_kernel_2(
+          unsigned int const *A_row_indices,
+          unsigned int const *A_col_indices,
+          NumericT     const *A_elements,
+          unsigned int A_size1,
+
+          unsigned int const *L_row_indices,
+          unsigned int       *L_col_indices,
+          NumericT           *L_elements,
+
+          unsigned int const *U_row_indices,
+          unsigned int       *U_col_indices,
+          NumericT           *U_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < A_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = A_row_indices[row];
+    unsigned int row_end   = A_row_indices[row+1];
+
+    unsigned int index_L = L_row_indices[row];
+    unsigned int index_U = U_row_indices[row];
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = A_col_indices[j];
+      NumericT value = A_elements[j];
+
+      if (col <= row)
+      {
+        L_col_indices[index_L] = col;
+        L_elements[index_L]    = value;
+        ++index_L;
+      }
+
+      if (col >= row)
+      {
+        U_col_indices[index_U] = col;
+        U_elements[index_U]    = value;
+        ++index_U;
+      }
+    }
+  }
+}
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  //
+  // Step 1: Count elements in L and U:
+  //
+  extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                    static_cast<unsigned int>(A.size1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");
+
+  //
+  // Step 2: Exclusive scan on row_buffers:
+  //
+  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+  U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+  //
+  // Step 3: Write entries
+  //
+  extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                    viennacl::cuda_arg<NumericT>(A.handle()),
+                                    static_cast<unsigned int>(A.size1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                    viennacl::cuda_arg<NumericT>(L.handle()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(U.handle2()),
+                                    viennacl::cuda_arg<NumericT>(U.handle())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");
+
+  L.generate_row_block_information();
+  // Note: block information for U will be generated after transposition
+
+} // extract_LU
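+
+// Usage sketch (hedged: L and U are assumed to be pre-sized to A's dimensions
+// by the caller before the row buffers are filled):
+//
+//   viennacl::compressed_matrix<float> A, L, U;
+//   // ... assemble A and size L, U to match ...
+//   viennacl::linalg::cuda::extract_LU(A, L, U);   // L keeps col <= row, U keeps col >= row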
+
+///////////////////////////////////////////////
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  // fill D:
+  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
+                                   viennacl::cuda_arg<NumericT>(A.handle()),
+                                   static_cast<unsigned int>(A.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+  // scale L:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                   viennacl::cuda_arg<NumericT>(L.handle()),
+                                   static_cast<unsigned int>(L.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+
+  // scale U:
+  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
+                                   viennacl::cuda_arg<unsigned int>(U.handle2()),
+                                   viennacl::cuda_arg<NumericT>(U.handle()),
+                                   static_cast<unsigned int>(U.size1()),
+                                   viennacl::cuda_arg<NumericT>(D.handle())
+                                  );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ILU sweep */
+template<typename NumericT>
+__global__ void ilu_chow_patel_sweep_kernel(
+          unsigned int const *L_row_indices,
+          unsigned int const *L_col_indices,
+          NumericT           *L_elements,
+          NumericT     const *L_backup,
+          unsigned int L_size1,
+
+          NumericT     const *aij_L,
+
+          unsigned int const *U_trans_row_indices,
+          unsigned int const *U_trans_col_indices,
+          NumericT           *U_trans_elements,
+          NumericT     const *U_trans_backup,
+
+          NumericT     const *aij_U_trans)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < L_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_L_start = L_row_indices[row];
+    unsigned int row_L_end   = L_row_indices[row + 1];
+
+    for (unsigned int j = row_L_start; j < row_L_end; ++j)
+    {
+      unsigned int col = L_col_indices[j];
+
+      if (col == row)
+        continue;
+
+      unsigned int row_U_start = U_trans_row_indices[col];
+      unsigned int row_U_end   = U_trans_row_indices[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_U = row_U_start;
+      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
+      NumericT sum = 0;
+      for (unsigned int k = row_L_start; k < j; ++k)
+      {
+        unsigned int col_L = L_col_indices[k];
+
+        // find element in U
+        while (col_U < col_L)
+        {
+          ++index_U;
+          col_U = U_trans_col_indices[index_U];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[k] * U_trans_backup[index_U];
+      }
+
+      // update l_ij:
+      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1];  // diagonal element is last entry in U
+    }
+
+
+    //
+    // update U:
+    //
+    unsigned int row_U_start = U_trans_row_indices[row];
+    unsigned int row_U_end   = U_trans_row_indices[row + 1];
+    for (unsigned int j = row_U_start; j < row_U_end; ++j)
+    {
+      unsigned int col = U_trans_col_indices[j];
+
+      row_L_start = L_row_indices[col];
+      row_L_end   = L_row_indices[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_L = row_L_start;
+      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
+      NumericT sum = 0;
+      for (unsigned int k = row_U_start; k < j; ++k)
+      {
+        unsigned int col_U = U_trans_col_indices[k];
+
+        // find element in L
+        while (col_L < col_U)
+        {
+          ++index_L;
+          col_L = L_col_indices[index_L];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[index_L] * U_trans_backup[k];
+      }
+
+      // update u_ij:
+      U_trans_elements[j] = aij_U_trans[j] - sum;
+    }
+  }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel-ILU using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  viennacl::backend::mem_handle L_backup;
+  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+  viennacl::backend::mem_handle U_backup;
+  viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+  viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+  ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
+                                            viennacl::cuda_arg<NumericT>(L.handle()),
+                                            viennacl::cuda_arg<NumericT>(L_backup),
+                                            static_cast<unsigned int>(L.size1()),
+
+                                            viennacl::cuda_arg<NumericT>(aij_L.handle()),
+
+                                            viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
+                                            viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
+                                            viennacl::cuda_arg<NumericT>(U_trans.handle()),
+                                            viennacl::cuda_arg<NumericT>(U_backup),
+
+                                            viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
+                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");
+
+}
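+
+// Iteration sketch (illustrative; aij_L / aij_U_trans hold the entries of the
+// scaled A restricted to the patterns of L and of the transposed U, as set up
+// by the calling preconditioner):
+//
+//   for (unsigned int sweep = 0; sweep < 3; ++sweep)
+//     viennacl::linalg::cuda::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+//
+// As in the ICC variant, the backup copies make each sweep a Jacobi-style
+// update: all reads within one sweep refer to the previous iterate.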
+
+//////////////////////////////////////
+
+template<typename NumericT>
+__global__ void ilu_form_neumann_matrix_kernel(
+          unsigned int const *R_row_indices,
+          unsigned int const *R_col_indices,
+          NumericT           *R_elements,
+          unsigned int R_size1,
+
+          NumericT           *D_elements)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < R_size1;
+                    row += gridDim.x * blockDim.x)
+  {
+    unsigned int row_begin = R_row_indices[row];
+    unsigned int row_end   = R_row_indices[row+1];
+
+    // part 1: extract diagonal entry
+    NumericT diag = 0;
+    for (unsigned int j = row_begin; j < row_end; ++j)
+    {
+      unsigned int col = R_col_indices[j];
+      if (col == row)
+      {
+        diag = R_elements[j];
+        R_elements[j] = 0; // (I - D^{-1}R)
+        break;
+      }
+    }
+    D_elements[row] = diag;
+
+    // part2: scale
+    for (unsigned int j = row_begin; j < row_end; ++j)
+      R_elements[j] /= -diag;
+  }
+}
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
+                                               viennacl::cuda_arg<unsigned int>(R.handle2()),
+                                               viennacl::cuda_arg<NumericT>(R.handle()),
+                                               static_cast<unsigned int>(R.size1()),
+                                               viennacl::cuda_arg<NumericT>(diag_R.handle())
+                                              );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
+}
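+
+// Application sketch (a hedged illustration; the truncation depth and the use of
+// element_div/prod here are assumptions of the caller, not part of this file).
+// After the call, R holds M = -D^{-1} * (off-diagonal part of R) and diag_R holds D,
+// so a truncated Neumann series approximates (D + R_offdiag)^{-1} b:
+//
+//   viennacl::vector<float> y = viennacl::linalg::element_div(b, diag_R);  // y = D^{-1} b
+//   viennacl::vector<float> x(y);
+//   viennacl::vector<float> tmp(y.size());
+//   for (unsigned int k = 0; k < 2; ++k)       // x ~ (I + M + M^2) D^{-1} b
+//   {
+//     tmp = viennacl::linalg::prod(R, y);      // next Neumann term M^(k+1) D^{-1} b
+//     y = tmp;
+//     x += y;
+//   }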
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif